# Listing metadata: 90 lines, 3.8 KiB, Python.
from pymongo import MongoClient
|
|
from github import Github
|
|
import re
|
|
import time
|
|
# Matches GitHub issue / pull-request links of the form
# "github.com/<owner>/<repo>/issues/<n>" or "github.com/<owner>/<repo>/pull/<n>"
# and captures the "<owner>/<repo>" part. The dot in "github.com" is escaped so
# that look-alikes such as "github/com/..." are not treated as links.
_GITHUB_REF_PATTERN = re.compile(
    r'github\.com/([a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+)/(?:issues|pull)/[0-9]+'
)


def match_repos(data, text):
    """Record every repository linked from one field of a series of documents.

    Each item of ``data`` is looked up under the key ``text``; the value is
    converted with ``str()`` and scanned for GitHub issue / pull-request
    links. The ``owner/repo`` part of every link found is added to the
    module-level ``reference_repo_set``.

    Args:
        data: Iterable of mapping-like documents (anything with ``.get``).
        text: Name of the field to scan, e.g. ``'body'`` or ``'title'``.

    Returns:
        None. Results are accumulated in ``reference_repo_set``.
    """
    for event in data:
        value = event.get(text)
        # Documents without the field (or with a null value) contain no links.
        if value is None:
            continue
        reference_repo_set.update(_GITHUB_REF_PATTERN.findall(str(value)))


# Initialization: accumulator of referenced "owner/repo" names, filled by
# match_repos() and by the callers that add to it directly.
reference_repo_set = set()
|
|
def get_refer(clt1, clt2):
    """Collect repositories referenced from two MongoDB collections.

    Connects to the ``numpy_db`` database on ``localhost:27017`` and adds
    every referenced ``owner/repo`` name to the module-level
    ``reference_repo_set``. Three sources are scanned:

    * ``body`` of ``clt1`` documents whose ``event`` is ``"commented"``;
    * ``source.issue.url`` of ``clt1`` documents whose ``event`` is
      ``"cross-referenced"``;
    * ``body`` and ``title`` of every document in ``clt2``.

    Args:
        clt1: Name of the collection holding the event documents.
        clt2: Name of the collection whose documents carry ``body``/``title``.

    Returns:
        None. Results are accumulated in ``reference_repo_set``.
    """
    # Connect to the database; the context manager closes the client on exit.
    with MongoClient('localhost', 27017) as client:
        db = client['numpy_db']
        collection1 = db[clt1]
        collection2 = db[clt2]

        # Comments.
        commented_data = collection1.find(
            {"event": "commented"},
            {"body": 1, "issue_number": 1, "_id": 0},
        )
        match_repos(commented_data, 'body')

        # Cross references.
        cross_referenced_data = collection1.find(
            {"event": "cross-referenced"},
            {"source": 1, "issue_number": 1, "_id": 0},
        )
        for event in cross_referenced_data:
            refer = event['source']['issue']['url']
            # The owner and repo are the 4th- and 3rd-from-last path segments
            # of the URL (".../<owner>/<repo>/<kind>/<number>").
            repo = '/'.join(refer.split('/')[-4:-2])
            reference_repo_set.add(repo)

        # Titles and descriptions. The cursor is materialised because a
        # cursor can only be consumed once; passing the same cursor to two
        # scans would leave the second scan (titles) with nothing to read.
        body_data = list(
            collection2.find(projection={"body": 1, "title": 1, "number": 1, "_id": 0})
        )
        match_repos(body_data, 'body')
        match_repos(body_data, 'title')
|
|
|
|
# Gather references from the main timeline and issue/PR collections.
get_refer('issue_timeline_new', 'issue_pr')

# Repositories listed as forks; they are excluded from the result below.
fork_repo = [
    'Gabbi68/numpy',
    'DD2480-Group-15/numpy',
    'seberg/numpy',
    'hpyproject/numpy-hpy',
    'xman/numpy-posit',
    'Mukulikaa/numpy',
    'George-Bassilious/numpy',
    'plctlab/numpy',
    'lvcarlosja/numpy',
]
# Scanning each fork's own collections is currently disabled:
# for repo in fork_repo:
#     get_refer(repo.replace('/','_')+'_timeline',repo.replace('/','_')+'_issue&pr')

# Filled later with the canonical names of the resolved repositories.
final_reference_repo_set = set()

# Names that must not appear in the candidate list (explicit entries + forks).
drop_list = [
    'Paradigm4/SciDB-Py',
    'Russell91/apollo',
    'chinminghuang/numpy',
    'luvsound/pippi',
    'numpy/numpy',
] + fork_repo

# Sorted candidates with every dropped name removed.
reference_repo_list = sorted(reference_repo_set.difference(drop_list))
print(len(reference_repo_list))
|
|
|
|
# Repository entity resolution: 2081 -> 2052 (including numpy/numpy), 2078 -> 2049
import os

# SECURITY: GitHub access tokens used to be hard-coded at this point. They are
# now read from the GITHUB_TOKENS environment variable (comma-separated).
# Any token that was ever committed to this file must be considered leaked
# and should be revoked.
_github_tokens = [
    tok.strip() for tok in os.environ.get("GITHUB_TOKENS", "").split(",") if tok.strip()
]
if not _github_tokens:
    raise SystemExit(
        "Set GITHUB_TOKENS to a comma-separated list of GitHub access tokens."
    )

# token -> priority score. Tokens start with their 1-based position as score;
# after each request the score of the token in use becomes
# (remaining requests) / (seconds until its rate limit resets).
token_dict = {token: rank for rank, token in enumerate(_github_tokens, start=1)}
access_token = max(token_dict, key=token_dict.get)
cnt = 0

for repository_name in reference_repo_list:
    # Every 100 repositories, switch to the token with the highest score.
    if cnt % 100 == 0:
        access_token = max(token_dict, key=token_dict.get)
        g = Github(access_token)
        print(cnt, end=' ')
    # print(repository_name,end=" ")
    repository = g.get_repo(repository_name)

    # Resolve to the same entity: keep the last two path segments
    # ("owner/repo") of the URL reported for the repository, so different
    # names that resolve to one repository collapse into one entry.
    url = str(repository.url)
    repo = '/'.join(url.split('/')[-2:])
    final_reference_repo_set.add(repo)

    rate_limit = g.rate_limiting
    # Clamp the remaining time so that a reset moment that has just passed
    # cannot cause a division by zero or a negative score.
    seconds_to_reset = max(g.rate_limiting_resettime - time.time(), 1.0)
    token_dict[access_token] = rate_limit[0] / seconds_to_reset
    cnt += 1
|
|
|
|
# Sorted list of the resolved repository names.
final_reference_repo_list = sorted(final_reference_repo_set)
print(len(final_reference_repo_list))

# Alternative output file, currently disabled:
# with open('numpy_refer_repo_name.txt','w',encoding='utf-8',newline='') as f:
#     f.write(str(final_reference_repo_list))

# Store the list as its Python literal representation.
with open('numpy_refer_repo_name(x).txt', 'w', encoding='utf-8', newline='') as out_file:
    out_file.write(str(final_reference_repo_list))
|