# github_mongoDB_dataprocess/get_rela_json.py

# Relation record schema written by this module:
#   from: source node id (user id or "<repo>+<issue_number>")
#   to:   target node id
#   attr: optional relation attribute (e.g. label, body)
#   time: event timestamp (ISO-8601 string from GitHub)
import os
import pandas as pd
import json
import collections
import pymongo
import ast
from get_entity_json import get_issues_and_users
def get_fork_json(src_dir='C:/Users/zhou/Desktop/github_mongoDB/fork_issue_number',
                  out_path='./json/relationship/fork.json'):
    """Build fork relations (fork repo -> forked repo) from per-repo Excel sheets.

    Each Excel file in `src_dir` is named '<repo>_fork_repo_issues...' and lists
    candidate fork repositories with an issue count column 'number'; only forks
    with at least one issue are kept.  The paths were hard-coded; they are now
    parameters with the original values as defaults.

    Parameters
    ----------
    src_dir : directory containing the per-repo Excel files.
    out_path : JSON file receiving the list of {"from", "to"} records.
    """
    relations = []
    for file_name in os.listdir(src_dir):
        df = pd.read_excel(os.path.join(src_dir, file_name))
        # Keep only fork repos that actually have issues.
        active_forks = df[df['number'] > 0]['repo_name'].tolist()
        forked_repo = file_name[:file_name.find('_fork_repo_issues')]
        for repo in active_forks:
            relations.append({
                "from": repo.replace('/', '_'),
                "to": forked_repo
            })
    with open(out_path, 'w') as f:
        json.dump(relations, f, indent=4)
def get_belongto_json(size):
    """Export (issue/PR -> repository) "belongto" relations as chunked JSON files.

    Scans three local MongoDB databases — 'numpy_db' (numpy and its forks),
    'reference_db' (upstream/downstream repos listed in
    numpy_refer_repo_name(addNumpyFork).txt) and 'fork_db' (forks of those) —
    for issues created before 2024-03-01, deduplicating issue numbers per repo.
    Writes ./json/relationship/belongtoN.json chunks of `size` records each
    and prints the total record count.  The three formerly copy-pasted loops
    now share one helper.

    Parameters
    ----------
    size : number of records per output JSON chunk.
    """
    cnt = 0
    tmp = []
    # Per-repo set of issue numbers already emitted, to drop duplicates.
    issue_set = collections.defaultdict(set)
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        # Append one "belongto" record per unique pre-2024-03 issue of `name`;
        # flush a numbered chunk file every `size` records.
        nonlocal cnt, tmp
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1, 'url': 1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[name]:
                continue  # duplicate issue/PR number for this repo
            issue_set[name].add(issue['number'])
            tmp.append({
                "from": name + "+" + str(issue['number']),
                "to": name
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/belongto' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('issue&pr') - 1])

    # upstream/downstream repositories of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/belongto' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt)
    return None
def get_mentionedin_closed_json(issue_set, pr_set, event_type, actor, rela, size):
    """Export (actor -> issue/PR) relations for one timeline event type.

    Pulls timeline events of `event_type` (created before 2024-03-01, on known
    issue/PR numbers) from 'numpy_db', 'reference_timeline' and
    'fork_reference', and writes ./json/relationship/<rela>N.json chunks.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    event_type : timeline event to select (e.g. 'mentioned', 'closed').
    actor : event field holding the acting user object.
    rela : output file-name prefix.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction (which can fail on incomplete events) is
    guarded now, with narrowed exception types.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': event_type}, {actor: 1, 'issue_number': 1, 'created_at': 1, '_id': 0})
        for event in events:
            # Some events are missing the actor object; skip those records only.
            try:
                record = {
                    "from": event[actor]['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + rela + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + rela + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
def get_opened_json(size):
    """Export (user -> issue/PR) "opened" relations as chunked JSON files.

    Scans 'numpy_db', 'reference_db' and 'fork_db' for issues created before
    2024-03-01, deduplicating issue numbers per repo, and records the opening
    user's id with the creation time.  Writes ./json/relationship/openedN.json
    chunks of `size` records and prints the total count.  The three formerly
    copy-pasted loops now share one helper.

    Parameters
    ----------
    size : number of records per output JSON chunk.
    """
    cnt = 0
    tmp = []
    # Per-repo set of issue numbers already emitted, to drop duplicates.
    issue_set = collections.defaultdict(set)
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1, 'url': 1, 'created_at': 1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[name]:
                continue  # duplicate issue/PR number for this repo
            issue_set[name].add(issue['number'])
            tmp.append({
                "from": issue['user']['id'],
                "to": name + "+" + str(issue['number']),
                "time": issue['created_at']
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/opened' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('issue&pr') - 1])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/opened' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt)
    return None
def get_commented_json(issue_set, pr_set, size):
    """Export (user -> issue/PR) "commented" relations as chunked JSON files.

    Pulls pre-2024-03 'commented' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference', recording the
    comment body and the author's repository association.  Writes
    ./json/relationship/commentedN.json chunks of `size` records each and
    prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'commented'}, {'user': 1, 'issue_number': 1, 'created_at': 1, 'body': 1, 'author_association': 1, '_id': 0})
        for event in events:
            # Some events are missing user/body fields; skip those records only.
            try:
                record = {
                    "from": event['user']['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at'],
                    "body": event['body'],
                    "author_association": event['author_association']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'commented' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'commented' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'commented')
def get_link_depend_json(issue_set, pr_set, size):
    """Export cross-reference "link" relations and derived repo-level "depend" relations.

    Pulls pre-2024-03 'cross-referenced' timeline events on known issues/PRs
    from 'numpy_db', 'reference_timeline' and 'fork_reference'.  Each event
    yields a link edge (source issue -> referenced issue) when the source repo
    is part of the node set; repo-level depend edges are aggregated from the
    link edges.  Output goes to ./json/relationship/linkN.json and dependN.json.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bugs fixed:
    - For the reference/fork databases the source repo was computed as
      `url.rfind(...).replace(...)` — calling str.replace on the int returned
      by rfind, which always raised and silently dropped every record via the
      broad except.  All branches now use the slicing form the numpy_db branch
      already used, via one helper.
    - Depend chunks were numbered from the current batch length, which resets
      on every flush, so each flush overwrote depend1.json; a running counter
      is used instead, and the final print reports the true total.
    - The chunk flush no longer sits inside the per-event try/except, so write
      errors are not silently swallowed.
    """
    tmp = []
    cnt = 0
    repo_rely = collections.defaultdict(set)
    # Repos that contribute at least one issue or PR node; edges from any
    # other repo are filtered out.
    all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def repo_from_url(url):
        # '.../repos/<owner>/<name>' -> '<owner>_<name>'
        return url[url.rfind('/', 0, url.rfind('/')) + 1:].replace('/', '_')

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'cross-referenced'}, {'source': 1, 'issue_number': 1, 'created_at': 1, 'actor': 1, '_id': 0})
        for event in events:
            # Some events are missing source/actor details; skip those only.
            try:
                source_issue = event['source']['issue']
                repo = repo_from_url(source_issue['repository_url'])
                if repo not in all_repos:
                    continue  # source repo is outside the node set
                record = {
                    "from": repo + "+" + str(source_issue['number']),
                    "to": name + "+" + str(event['issue_number']),
                    "actor": event['actor']['id'],
                    "time": event['created_at']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            repo_rely[repo].add(name)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'link' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial link chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'link' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'link')

    # Repo-level dependency edges derived from the link aggregation above.
    depend = []
    dep_cnt = 0
    for repo in repo_rely.keys():
        for linked_repo in repo_rely[repo]:
            depend.append({
                "from": repo,
                "to": linked_repo
            })
            dep_cnt += 1
            if not dep_cnt % size:
                with open('./json/relationship/' + 'depend' + str(dep_cnt // size) + '.json', 'w') as f:
                    json.dump(depend, f, indent=4)
                depend = []
    if depend:
        with open('./json/relationship/' + 'depend' + str(dep_cnt // size + 1) + '.json', 'w') as f:
            json.dump(depend, f, indent=4)
    print(dep_cnt, 'depend')
def get_labeled_json(issue_set, pr_set, size):
    """Export (user -> issue/PR) "labeled" relations as chunked JSON files.

    Pulls pre-2024-03 'labeled' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference', recording the label
    name applied.  Writes ./json/relationship/labeledN.json chunks of `size`
    records each and prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'labeled'}, {'actor': 1, 'issue_number': 1, 'created_at': 1, 'label': 1, '_id': 0})
        for event in events:
            # Some events are missing actor/label fields; skip those records only.
            try:
                record = {
                    "from": event['actor']['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at'],
                    "label": event['label']['name']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'labeled' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'labeled' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'labeled')
def get_assigned_json(issue_set, pr_set, size):
    """Export "assigned" relations (assigner, assignee, issue) as chunked JSON files.

    Pulls pre-2024-03 'assigned' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference'.  Each record holds
    the assigner's and assignee's user ids, the time, and the issue node id.
    Writes ./json/relationship/assignedN.json chunks of `size` records each
    and prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'assigned'}, {'actor': 1, 'issue_number': 1, 'created_at': 1, 'assignee': 1, '_id': 0})
        for event in events:
            # Some events are missing actor/assignee fields; skip those only.
            try:
                record = {
                    "assigner": event['actor']['id'],
                    "assignee": event['assignee']['id'],
                    "time": event['created_at'],
                    "issue": name + "+" + str(event['issue_number'])
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'assigned' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'assigned' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'assigned')
def split_file():
    """One-off utility: split commented161.json into two smaller chunk files.

    Records [0, 27000) go to commented400.json; the remainder goes to
    commented401.json.
    """
    with open('./json/relationship/commented161.json', 'r', encoding='utf-8') as src:
        records = json.load(src)
    head, tail = records[0:27000], records[27000:]
    with open('./json/relationship/commented400.json', 'w') as out:
        json.dump(head, out, indent=4)
    with open('./json/relationship/commented401.json', 'w') as out:
        json.dump(tail, out, indent=4)
def test(issue_set, pr_set, size):
    """Debug helper: inspect cross-reference edges originating from
    'bioconda_bioconda-recipes' in numpy_db and count kept events.

    NOTE(review): reconstructed from indentation-stripped source; the nesting
    of the two prints vs. the counter increment is the most plausible reading
    (prints inside the repo check, increment per kept event) — confirm against
    the original file.
    """
    cnt = 0
    repo_rely = collections.defaultdict(set)
    # Repos that contribute at least one issue or PR node.
    all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' not in collection_name:
            continue
        name = collection_name[:collection_name.find('_timeline')]
        collection = db[collection_name]
        # Cross-reference events on known issues/PRs created before 2024-03.
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'cross-referenced'}, {'source': 1, 'issue_number': 1, 'created_at': 1, 'actor': 1, '_id': 0})
        for event in events:
            # Some events are missing source/actor details; best-effort skip.
            try:
                url = event['source']['issue']['repository_url']
                repo = url[url.rfind('/', 0, url.rfind('/')) + 1:].replace('/', '_')
                if repo not in all_repos:
                    continue
                repo_rely[repo].add(name)
                if repo == 'bioconda_bioconda-recipes':
                    print({
                        "from": repo + "+" + str(event['source']['issue']['number']),
                        "to": name + "+" + str(event['issue_number']),
                        "actor": event['actor']['id'],
                        "time": event['created_at']
                    })
                    print(cnt)
                cnt += 1
            except Exception as e:
                pass
    print(cnt)
if __name__ == "__main__":
    # Chunk size for most relation exports (records per JSON file).
    size = 200000
    # get_fork_json()
    # get_belongto_json(size)
    issue_set, pr_set, user_set = get_issues_and_users()
    # get_mentionedin_closed_json(issue_set, pr_set, 'mentioned', 'actor', 'mentionedin', size)
    # get_mentionedin_closed_json(issue_set, pr_set, 'closed', 'actor', 'closed', size)
    # get_opened_json(size)
    # get_commented_json(issue_set, pr_set, 50000)
    # get_link_depend_json(issue_set, pr_set, size)
    # get_labeled_json(issue_set, pr_set, size)
    # get_assigned_json(issue_set, pr_set, size)
    # split_file()
    test(issue_set, pr_set, size)