# github_mongoDB_dataprocess/get_refer_repos.py
#
# Listing metadata carried over with the file: 90 lines, 3.8 KiB, Python

import os
import re
import time

from github import Github
from pymongo import MongoClient
# Matches a link to a GitHub issue or pull request and captures the
# "<owner>/<repo>" part. The dot in "github.com" is escaped so that only a
# literal dot matches; the pattern is compiled once instead of per record.
_ISSUE_OR_PR_LINK = re.compile(
    r'github\.com/([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)/(?:issues|pull)/[0-9]+'
)

# Initialisation: accumulates every "<owner>/<repo>" name found so far.
# match_repos() and get_refer() both add to this set.
reference_repo_set = set()


def match_repos(data, text):
    """Collect repositories referenced by issue/PR links found in ``data``.

    Every ``github.com/<owner>/<repo>/issues/<n>`` or
    ``github.com/<owner>/<repo>/pull/<n>`` link found in
    ``str(record[text])`` adds ``"<owner>/<repo>"`` to the module-level
    ``reference_repo_set``.

    Args:
        data: Iterable of records supporting ``record[text]`` (the callers in
            this file pass MongoDB query results).
        text: Key of the field to scan in each record.

    Returns:
        None. Results are accumulated in ``reference_repo_set``.
    """
    for event in data:
        try:
            value = event[text]
        except KeyError:
            # A record without the requested field has nothing to scan.
            continue
        # With a single capture group, findall() yields the captured
        # "<owner>/<repo>" strings directly.
        reference_repo_set.update(_ISSUE_OR_PR_LINK.findall(str(value)))
def get_refer(clt1, clt2):
    """Add every repository referenced in two collections to ``reference_repo_set``.

    Three sources are scanned in the local ``numpy_db`` database:

    * bodies of ``commented`` events in ``clt1``;
    * the source issue URL of ``cross-referenced`` events in ``clt1``;
    * the ``body`` and ``title`` of every document in ``clt2``.

    Args:
        clt1: Name of the collection holding timeline events.
        clt2: Name of the collection holding issue / pull-request documents.

    Returns:
        None. Results are accumulated in the module-level
        ``reference_repo_set``.

    Raises:
        ValueError: If a cross-reference URL has fewer than four ``/``
            separators and so cannot be split into owner and repository.
    """
    # Connect to the database; the context manager closes the client (and its
    # connections) once all cursors below have been consumed.
    with MongoClient('localhost', 27017) as client:
        db = client['numpy_db']
        collection1 = db[clt1]
        collection2 = db[clt2]

        # Comments
        commented_data = collection1.find(
            {"event": "commented"},
            {"body": 1, "issue_number": 1, "_id": 0},
        )
        match_repos(commented_data, 'body')

        # Cross references
        cross_referenced_data = collection1.find(
            {"event": "cross-referenced"},
            {"source": 1, "issue_number": 1, "_id": 0},
        )
        for event in cross_referenced_data:
            refer = event['source']['issue']['url']
            # The URL ends in ".../<owner>/<repo>/<segment>/<segment>";
            # keep the two path segments that precede the last two.
            _, owner, name, _, _ = refer.rsplit('/', 4)
            reference_repo_set.add(owner + '/' + name)

        # Titles and descriptions. The result is materialised into a list
        # because a cursor can only be iterated once: passing the same cursor
        # to match_repos() twice would leave the second pass with no records.
        body_data = list(
            collection2.find(
                projection={"body": 1, "title": 1, "number": 1, "_id": 0},
            )
        )
        match_repos(body_data, 'body')
        match_repos(body_data, 'title')
    return None
# --- GitHub credentials -----------------------------------------------------
# Access tokens are read from the GITHUB_TOKENS environment variable (a
# comma-separated list) instead of being hard-coded in the source file.
# NOTE(review): tokens that were previously committed in this file should be
# treated as leaked and revoked.
# The check runs first so that a missing configuration fails before the
# database scan starts.
_github_tokens = [
    token.strip()
    for token in os.environ.get('GITHUB_TOKENS', '').split(',')
    if token.strip()
]
if not _github_tokens:
    raise SystemExit(
        'Set GITHUB_TOKENS to a comma-separated list of GitHub access tokens'
    )
# token -> score. The token with the highest score is picked next. Initial
# scores are just the 1-based position in the list; they are replaced by a
# measured value once a token has been used.
token_dict = {token: rank for rank, token in enumerate(_github_tokens, start=1)}

# --- Collect raw references -------------------------------------------------
get_refer('issue_timeline_new', 'issue_pr')
fork_repo = ['Gabbi68/numpy', 'DD2480-Group-15/numpy', 'seberg/numpy', 'hpyproject/numpy-hpy', 'xman/numpy-posit', 'Mukulikaa/numpy', 'George-Bassilious/numpy', 'plctlab/numpy', 'lvcarlosja/numpy']
# Disabled: also scan the per-fork collections.
# for repo in fork_repo:
#     get_refer(repo.replace('/','_')+'_timeline',repo.replace('/','_')+'_issue&pr')

# --- Drop excluded repositories ---------------------------------------------
drop_list = ['Paradigm4/SciDB-Py','Russell91/apollo','chinminghuang/numpy','luvsound/pippi','numpy/numpy']
drop_list = drop_list + fork_repo
# Set difference replaces the repeated list.remove() calls; the result is the
# same sorted list of names that are not in drop_list.
reference_repo_list = sorted(reference_repo_set.difference(drop_list))
print(len(reference_repo_list))

# --- Repository entity resolution -------------------------------------------
# Counts noted by the original author: 2081 -> 2052 (including numpy/numpy),
# 2078 -> 2049.
final_reference_repo_set = set()
access_token = None
g = None
cnt = 0
for repository_name in reference_repo_list:
    if cnt % 100 == 0:
        # Every 100 lookups, switch to the token with the best score.
        access_token = max(token_dict, key=token_dict.get)
        g = Github(access_token)
        # NOTE(review): indentation was lost in the original; the progress
        # print is assumed to belong to this every-100 branch.
        print(cnt, end=' ')
    # print(repository_name,end=" ")
    repository = g.get_repo(repository_name)
    # Same-entity identification: take "<owner>/<repo>" from the tail of the
    # URL reported by the API rather than from the name that was looked up
    # (presumably so that renamed or moved repositories collapse to one
    # entry - confirm).
    repo = '/'.join(str(repository.url).rsplit('/', 2)[1:])
    final_reference_repo_set.add(repo)
    # Score = remaining requests per second until the limit resets. The
    # divisor is clamped to at least one second so a reset time that is
    # already in the past cannot cause a ZeroDivisionError or a negative
    # score.
    rate_limit = g.rate_limiting
    seconds_to_reset = max(g.rate_limiting_resettime - time.time(), 1.0)
    token_dict[access_token] = rate_limit[0] / seconds_to_reset
    cnt += 1

# --- Write the result -------------------------------------------------------
final_reference_repo_list = sorted(final_reference_repo_set)
print(len(final_reference_repo_list))
# with open('numpy_refer_repo_name.txt','w',encoding='utf-8',newline='') as f:
#     f.write(str(final_reference_repo_list))
with open('numpy_refer_repo_name(x).txt','w',encoding='utf-8',newline='') as f:
    f.write(str(final_reference_repo_list))