# Relationship-JSON exporter for the numpy ecosystem graph.
#
# Every exported record is a graph edge with (a subset of) these fields:
#   from: source node id (repo, user id, or "<repo>+<issue_number>")
#   to:   target node id
#   attr: optional edge attribute (label name, comment body, ...)
#   time: event timestamp (ISO-8601, UTC)
import os
|
||
import pandas as pd
|
||
import json
|
||
import collections
|
||
import pymongo
|
||
import ast
|
||
from get_entity_json import get_issues_and_users
|
||
|
||
def get_fork_json():
    """Export repo -> forked-repo edges to ./json/relationship/fork.json.

    Scans the per-repo Excel sheets in the fork_issue_number directory;
    each sheet lists fork repos and their issue counts.  Only forks with
    at least one issue (number > 0) produce an edge.  Node ids use '_'
    instead of '/' in "owner/repo" names.
    """
    tmp = []
    file_names = os.listdir('C:/Users/zhou/Desktop/github_mongoDB/fork_issue_number')
    for file_name in file_names:
        file_path = os.path.join('C:/Users/zhou/Desktop/github_mongoDB/fork_issue_number', file_name)
        df = pd.read_excel(file_path)
        # forks that actually have issues; column 'repo_name' holds "owner/repo"
        filtered_repos = df[df['number'] > 0]['repo_name'].tolist()
        # the forked (parent) repo name is encoded in the file name itself
        forked_repo = file_name[:file_name.find('_fork_repo_issues')]
        for repos in filtered_repos:
            tmp.append({
                "from":repos.replace('/','_'),
                "to":forked_repo
            })
    with open('./json/relationship/fork.json', 'w') as f:
        json.dump(tmp, f, indent=4)
|
||
|
||
def get_belongto_json(size):
    """Export "<repo>+<issue_number>" -> "<repo>" belong-to edges.

    Walks three MongoDB databases (numpy and its forks, their
    upstream/downstream repos, and forks of those) and emits one edge per
    unique issue/PR created before 2024-03-01.  Edges are flushed to
    numbered files ./json/relationship/belongto<k>.json every `size`
    records, with a final leftover file at the end.

    Args:
        size: number of edges per output JSON file.
    """
    # initialisation
    cnt = 0
    tmp = []
    # per-repo set of issue numbers already emitted (dedup across collections)
    issue_set = collections.defaultdict(set)

    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            # repo name is the collection name minus the "_issue&pr" suffix
            name = collection_name[:collection_name.find('issue&pr') - 1]
            collection = db[collection_name]
            issues = collection.find(
                {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
                {'number': 1, 'user': 1,'url':1, '_id': 0})
            for issue in issues:
                # skip duplicate issues/PRs
                if issue['number'] in issue_set[name]:
                    continue
                issue_set[name].add(issue['number'])
                tmp.append({
                    "from":name +"+"+ str(issue['number']),
                    "to":name
                })
                cnt += 1
                # flush a full batch to its own numbered file
                if not cnt%size:
                    with open('./json/relationship/belongto'+str(cnt//size)+'.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []

    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1,'url':1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[collection_name]:
                continue
            issue_set[collection_name].add(issue['number'])
            tmp.append({
                "from": collection_name +"+"+ str(issue['number']),
                "to": collection_name
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/belongto' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # forks of the upstream/downstream repos
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1,'url':1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[collection_name]:
                continue
            issue_set[collection_name].add(issue['number'])
            tmp.append({
                "from": collection_name +"+"+ str(issue['number']),
                "to": collection_name
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/belongto' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []
    # final partial batch
    if tmp:
        with open('./json/relationship/belongto' + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)

    print(cnt)
    return None
|
||
|
||
def get_mentionedin_closed_json(issue_set,pr_set,event_type,actor,rela,size):
    """Export user -> issue edges for one timeline event type (e.g. mentioned/closed).

    Args:
        issue_set: dict repo -> set of issue numbers to consider.
        pr_set: dict repo -> set of PR numbers to consider.
        event_type: timeline event name to match (e.g. 'mentioned', 'closed').
        actor: name of the event field holding the acting user (e.g. 'actor').
        rela: output file name prefix under ./json/relationship/.
        size: number of edges per output JSON file.
    """
    tmp = []
    cnt = 0
    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            name = collection_name[:collection_name.find('_timeline')]
            collection = db[collection_name]
            # selected events must satisfy: the issue was created before 2024-03,
            # the event type matches, and the event itself happened before 2024-03
            events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
                                      "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                      'event':event_type},{actor: 1,'issue_number':1,'created_at':1, '_id': 0})
            # some event objects have missing fields, hence the try/except
            for event in events:
                try:
                    tmp.append({
                        "from":event[actor]['id'],
                        "to":name +"+"+ str(event['issue_number']),
                        "time":event['created_at']
                    })
                    cnt += 1
                    if not cnt % size:
                        with open('./json/relationship/'+rela+ str(cnt // size) + '.json', 'w') as f:
                            json.dump(tmp, f, indent=4)
                        tmp = []
                except Exception as e:
                    pass
    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': event_type}, {actor: 1,'issue_number':1,'created_at':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event[actor]['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/'+rela + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # forks of the upstream/downstream repos
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': event_type}, {actor: 1,'issue_number':1,'created_at':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event[actor]['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/'+rela + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # final partial batch
    if tmp:
        with open('./json/relationship/'+rela + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
|
||
|
||
def get_opened_json(size):
    """Export opener-user -> issue edges ("opened" relation).

    One edge per unique issue/PR created before 2024-03-01, across the
    three databases (numpy+forks, upstream/downstream, their forks).
    Edges are flushed to ./json/relationship/opened<k>.json every `size`
    records.

    Args:
        size: number of edges per output JSON file.
    """
    # initialisation
    cnt = 0
    tmp = []
    # per-repo set of issue numbers already emitted (dedup across collections)
    issue_set = collections.defaultdict(set)

    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            name = collection_name[:collection_name.find('issue&pr') - 1]
            collection = db[collection_name]
            issues = collection.find(
                {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
                {'number': 1, 'user': 1,'url':1,'created_at':1, '_id': 0})
            for issue in issues:
                # skip duplicate issues/PRs
                if issue['number'] in issue_set[name]:
                    continue
                issue_set[name].add(issue['number'])
                tmp.append({
                    "from":issue['user']['id'],
                    "to":name +"+"+ str(issue['number']),
                    "time":issue['created_at']
                })
                cnt += 1
                if not cnt%size:
                    with open('./json/relationship/opened'+str(cnt//size)+'.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []

    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1,'url':1,'created_at':1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[collection_name]:
                continue
            issue_set[collection_name].add(issue['number'])
            tmp.append({
                "from": issue['user']['id'],
                "to": collection_name +"+"+ str(issue['number']),
                "time": issue['created_at']
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/opened' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # forks of the upstream/downstream repos
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1,'url':1,'created_at':1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[collection_name]:
                continue
            issue_set[collection_name].add(issue['number'])
            tmp.append({
                "from": issue['user']['id'],
                "to": collection_name +"+"+ str(issue['number']),
                "time": issue['created_at']
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/opened' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []
    # final partial batch
    if tmp:
        with open('./json/relationship/opened' + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)

    print(cnt)
    return None
|
||
|
||
def get_commented_json(issue_set,pr_set,size):
    """Export commenter-user -> issue edges with comment body and association.

    Like get_mentionedin_closed_json but specialised to 'commented' events:
    each edge additionally carries the comment "body" and the commenter's
    "author_association".

    Args:
        issue_set: dict repo -> set of issue numbers to consider.
        pr_set: dict repo -> set of PR numbers to consider.
        size: number of edges per output JSON file.
    """
    tmp = []
    cnt = 0
    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            name = collection_name[:collection_name.find('_timeline')]
            collection = db[collection_name]
            # selected events must satisfy: the issue was created before 2024-03,
            # the event type matches, and the event itself happened before 2024-03
            events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
                                      "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                      'event':'commented'},{'user': 1,'issue_number':1,'created_at':1,'body':1, 'author_association':1, '_id': 0})
            # some event objects have missing fields, hence the try/except
            for event in events:
                try:
                    tmp.append({
                        "from":event['user']['id'],
                        "to":name +"+"+ str(event['issue_number']),
                        "time":event['created_at'],
                        "body":event['body'],
                        "author_association":event['author_association']
                    })
                    cnt += 1
                    if not cnt % size:
                        with open('./json/relationship/'+'commented'+ str(cnt // size) + '.json', 'w') as f:
                            json.dump(tmp, f, indent=4)
                        tmp = []
                except Exception as e:
                    pass
    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'commented'}, {'user': 1,'issue_number':1,'created_at':1,'body':1, 'author_association':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event['user']['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at'],
                    "body":event['body'],
                    "author_association":event['author_association']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/'+'commented' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # forks of the upstream/downstream repos
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'commented'}, {'user': 1,'issue_number':1,'created_at':1,'body':1, 'author_association':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event['user']['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at'],
                    "body":event['body'],
                    "author_association":event['author_association']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/'+'commented' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # final partial batch
    if tmp:
        with open('./json/relationship/'+'commented' + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt,'commented')
|
||
|
||
def get_link_depend_json(issue_set,pr_set,size):
|
||
|
||
tmp = []
|
||
cnt = 0
|
||
repo_rely = collections.defaultdict(set)
|
||
all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
|
||
# 数据库链接
|
||
client = pymongo.MongoClient("mongodb://localhost:27017/")
|
||
# numpy及其fork
|
||
db = client['numpy_db']
|
||
for collection_name in db.list_collection_names():
|
||
if '_timeline' in collection_name:
|
||
name = collection_name[:collection_name.find('_timeline')]
|
||
collection = db[collection_name]
|
||
# 取出的事件满足条件:issue建立于2024年3月之前;事件类型符合,事件发生在2024年3月之前
|
||
events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
|
||
"created_at": {"$lt": "2024-03-01T00:00:00Z"},
|
||
'event':'cross-referenced'},{'source': 1,'issue_number':1,'created_at':1,'actor':1,'_id': 0})
|
||
#存在部分事件对象信息缺失
|
||
for event in events:
|
||
try:
|
||
repo = event['source']['issue']['repository_url'][event['source']['issue']['repository_url'].rfind('/',0,event['source']['issue']['repository_url'].rfind('/'))+1:].replace('/','_')
|
||
#筛掉repo节点外的关系
|
||
if repo not in all_repos:
|
||
continue
|
||
tmp.append({
|
||
"from":repo+"+"+ str(event['source']['issue']['number']),
|
||
"to":name +"+"+ str(event['issue_number']),
|
||
"actor":event['actor']['id'],
|
||
"time":event['created_at']
|
||
})
|
||
repo_rely[repo].add(name)
|
||
cnt += 1
|
||
if not cnt % size:
|
||
with open('./json/relationship/'+'link'+ str(cnt // size) + '.json', 'w') as f:
|
||
json.dump(tmp, f, indent=4)
|
||
tmp = []
|
||
except Exception as e:
|
||
pass
|
||
# numpy及其fork的上下游
|
||
with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
|
||
repo_list = set(ast.literal_eval(f.read()))
|
||
db = client['reference_timeline']
|
||
for collection_name in db.list_collection_names():
|
||
# 删除数据库中非上下游的仓库的信息
|
||
if collection_name.replace('_', '/', 1) not in repo_list:
|
||
continue
|
||
collection = db[collection_name]
|
||
events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
|
||
"created_at": {"$lt": "2024-03-01T00:00:00Z"},
|
||
'event': 'cross-referenced'}, {'source': 1,'issue_number':1,'created_at':1,'actor':1,'_id': 0})
|
||
for event in events:
|
||
try:
|
||
repo = event['source']['issue']['repository_url'].rfind('/', 0, event['source']['issue'][
|
||
'repository_url'].rfind('/')).replace('/', '_')
|
||
if repo not in all_repos:
|
||
continue
|
||
tmp.append({
|
||
"from": repo +"+"+ str(event['source']['issue']['number']),
|
||
"to": collection_name +"+"+ str(event['issue_number']),
|
||
"actor": event['actor']['id'],
|
||
"time": event['created_at']
|
||
})
|
||
repo_rely[repo].add(collection_name)
|
||
cnt += 1
|
||
if not cnt % size:
|
||
with open('./json/relationship/' + 'link' + str(cnt // size) + '.json', 'w') as f:
|
||
json.dump(tmp, f, indent=4)
|
||
tmp = []
|
||
except Exception as e:
|
||
pass
|
||
# 上下游的fork
|
||
db = client['fork_reference']
|
||
for collection_name in db.list_collection_names():
|
||
collection = db[collection_name]
|
||
events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
|
||
"created_at": {"$lt": "2024-03-01T00:00:00Z"},
|
||
'event': 'cross-referenced'}, {'source': 1,'issue_number':1,'created_at':1,'actor':1,'_id': 0})
|
||
for event in events:
|
||
try:
|
||
repo = event['source']['issue']['repository_url'].rfind('/', 0, event['source']['issue'][
|
||
'repository_url'].rfind('/')).replace('/', '_')
|
||
if repo not in all_repos:
|
||
continue
|
||
tmp.append({
|
||
"from": repo +"+"+ str(event['source']['issue']['number']),
|
||
"to": collection_name +"+"+ str(event['issue_number']),
|
||
"actor": event['actor']['id'],
|
||
"time": event['created_at']
|
||
})
|
||
repo_rely[repo].add(collection_name)
|
||
cnt += 1
|
||
if not cnt % size:
|
||
with open('./json/relationship/' + 'link' + str(cnt // size) + '.json', 'w') as f:
|
||
json.dump(tmp, f, indent=4)
|
||
tmp = []
|
||
except Exception as e:
|
||
pass
|
||
if tmp:
|
||
with open('./json/relationship/'+'link' + str(cnt // size+1) + '.json', 'w') as f:
|
||
json.dump(tmp, f, indent=4)
|
||
print(cnt,'link')
|
||
#depend
|
||
depend = []
|
||
for repo in repo_rely.keys():
|
||
for linked_repo in repo_rely[repo]:
|
||
depend.append({
|
||
"from":repo,
|
||
"to":linked_repo
|
||
})
|
||
if not len(depend) % size:
|
||
with open('./json/relationship/' + 'depend' + str(len(depend) // size) + '.json', 'w') as f:
|
||
json.dump(depend, f, indent=4)
|
||
depend = []
|
||
if depend:
|
||
with open('./json/relationship/' + 'depend' + str(len(depend) // size + 1) + '.json', 'w') as f:
|
||
json.dump(depend, f, indent=4)
|
||
print(len(depend),'depend')
|
||
def get_labeled_json(issue_set,pr_set,size):
    """Export labeler-user -> issue edges for 'labeled' events, with label name.

    Args:
        issue_set: dict repo -> set of issue numbers to consider.
        pr_set: dict repo -> set of PR numbers to consider.
        size: number of edges per output JSON file.
    """
    tmp = []
    cnt = 0
    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            name = collection_name[:collection_name.find('_timeline')]
            collection = db[collection_name]
            # selected events must satisfy: the issue was created before 2024-03,
            # the event type matches, and the event itself happened before 2024-03
            events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
                                      "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                      'event':'labeled'},{'actor': 1,'issue_number':1,'created_at':1, 'label':1, '_id': 0})
            # some event objects have missing fields, hence the try/except
            for event in events:
                try:
                    tmp.append({
                        "from":event['actor']['id'],
                        "to":name +"+"+ str(event['issue_number']),
                        "time":event['created_at'],
                        "label":event['label']['name']
                    })
                    cnt += 1
                    if not cnt % size:
                        with open('./json/relationship/'+'labeled'+ str(cnt // size) + '.json', 'w') as f:
                            json.dump(tmp, f, indent=4)
                        tmp = []
                except Exception as e:
                    pass
    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event':'labeled'},{'actor': 1,'issue_number':1,'created_at':1, 'label':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event['actor']['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at'],
                    "label": event['label']['name']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/' + 'labeled' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # forks of the upstream/downstream repos
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event':'labeled'},{'actor': 1,'issue_number':1,'created_at':1, 'label':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "from": event['actor']['id'],
                    "to": collection_name +"+"+ str(event['issue_number']),
                    "time": event['created_at'],
                    "label": event['label']['name']
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/' + 'labeled' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # final partial batch
    if tmp:
        with open('./json/relationship/'+'labeled' + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt,'labeled')
|
||
def get_assigned_json(issue_set,pr_set,size):
    """Export assignment records for 'assigned' events.

    Unlike the other exporters, each record is a 4-field assignment
    (assigner user id, assignee user id, timestamp, issue node id)
    rather than a from/to edge.

    Args:
        issue_set: dict repo -> set of issue numbers to consider.
        pr_set: dict repo -> set of PR numbers to consider.
        size: number of records per output JSON file.
    """
    tmp = []
    cnt = 0
    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            name = collection_name[:collection_name.find('_timeline')]
            collection = db[collection_name]
            # selected events must satisfy: the issue was created before 2024-03,
            # the event type matches, and the event itself happened before 2024-03
            events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
                                      "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                      'event':'assigned'},{'actor': 1,'issue_number':1,'created_at':1, 'assignee':1, '_id': 0})
            # some event objects have missing fields, hence the try/except
            for event in events:
                try:
                    tmp.append({
                        "assigner":event['actor']['id'],
                        "assignee":event['assignee']['id'],
                        "time":event['created_at'],
                        "issue":name +"+"+ str(event['issue_number'])
                    })
                    cnt += 1
                    if not cnt % size:
                        with open('./json/relationship/'+'assigned'+ str(cnt //size) + '.json', 'w') as f:
                            json.dump(tmp, f, indent=4)
                        tmp = []
                except Exception as e:
                    pass
    # upstream/downstream repos of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        # skip collections for repos that are not upstream/downstream
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event':'assigned'},{'actor': 1,'issue_number':1,'created_at':1, 'assignee':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "assigner": event['actor']['id'],
                    "assignee": event['assignee']['id'],
                    "time": event['created_at'],
                    "issue": collection_name +"+"+ str(event['issue_number'])
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/' + 'assigned' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # forks of the upstream/downstream repos
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collection = db[collection_name]
        events = collection.find({"issue_number": {"$in": list(issue_set[collection_name])+list(pr_set[collection_name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event':'assigned'},{'actor': 1,'issue_number':1,'created_at':1, 'assignee':1, '_id': 0})
        for event in events:
            try:
                tmp.append({
                    "assigner": event['actor']['id'],
                    "assignee": event['assignee']['id'],
                    "time": event['created_at'],
                    "issue": collection_name +"+"+ str(event['issue_number'])
                })
                cnt += 1
                if not cnt % size:
                    with open('./json/relationship/' + 'assigned' + str(cnt // size) + '.json', 'w') as f:
                        json.dump(tmp, f, indent=4)
                    tmp = []
            except Exception as e:
                pass
    # final partial batch
    if tmp:
        with open('./json/relationship/'+'assigned' + str(cnt // size+1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt,'assigned')
|
||
|
||
def split_file():
    """Split an oversized commented-relationship file into two smaller ones.

    Reads ./json/relationship/commented161.json and writes its records out
    again as commented400.json / commented401.json, cut at record 27000.
    """
    with open('./json/relationship/commented161.json', 'r', encoding='utf-8') as src:
        records = json.load(src)
    cut = 27000
    targets = (
        ('./json/relationship/commented400.json', records[0:cut]),
        ('./json/relationship/commented401.json', records[cut:]),
    )
    for path, chunk in targets:
        with open(path, 'w') as dst:
            json.dump(chunk, dst, indent=4)
|
||
|
||
def test(issue_set,pr_set,size):
    """Debug probe for cross-referenced events in numpy_db timelines.

    Counts matching cross-reference events and prints the ones whose
    source repo is 'bioconda_bioconda-recipes'.  No files are written.

    Args:
        issue_set: dict repo -> set of issue numbers to consider.
        pr_set: dict repo -> set of PR numbers to consider.
        size: unused here; kept for signature parity with the exporters.
    """
    cnt = 0
    # repo -> set of repos that reference it
    repo_rely = collections.defaultdict(set)
    # all repos that actually have issues or PRs
    all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
    # database connection
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            name = collection_name[:collection_name.find('_timeline')]
            collection = db[collection_name]
            # selected events must satisfy: the issue was created before 2024-03,
            # the event type matches, and the event itself happened before 2024-03
            events = collection.find({"issue_number": {"$in":list(issue_set[name])+list(pr_set[name])},
                                      "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                      'event':'cross-referenced'},{'source': 1,'issue_number':1,'created_at':1,'actor':1,'_id': 0})
            # some event objects have missing fields, hence the try/except
            for event in events:
                try:
                    # 'owner_repo' node id parsed from the source issue's repository URL
                    repo = event['source']['issue']['repository_url'][event['source']['issue']['repository_url'].rfind('/',0,event['source']['issue']['repository_url'].rfind('/'))+1:].replace('/','_')
                    # discard relations whose source repo is not a known node
                    if repo not in all_repos:
                        continue
                    repo_rely[repo].add(name)
                    if repo == 'bioconda_bioconda-recipes':
                        print({
                            "from":repo+"+"+ str(event['source']['issue']['number']),
                            "to":name +"+"+ str(event['issue_number']),
                            "actor":event['actor']['id'],
                            "time":event['created_at']
                        })
                        print(cnt)
                    cnt += 1

                except Exception as e:
                    pass
    print(cnt)
|
||
|
||
if __name__ == "__main__":

    # batch size: records per output JSON file
    size = 200000
    # get_fork_json()
    # get_belongto_json(size)
    # issue/PR/user id sets shared by all timeline-based exporters
    issue_set,pr_set,user_set = get_issues_and_users()
    # get_mentionedin_closed_json(issue_set,pr_set,'mentioned','actor','mentionedin',size)
    # get_mentionedin_closed_json(issue_set,pr_set,'closed','actor','closed',size)
    # get_opened_json(size)
    # get_commented_json(issue_set, pr_set,50000)
    # get_link_depend_json(issue_set, pr_set,size)
    # get_labeled_json(issue_set, pr_set,size)
    # get_assigned_json(issue_set, pr_set,size)

    # split_file()
    test(issue_set, pr_set, size)
|
||
|
||
|
||
|