github_mongoDB_dataprocess/get_entity_json.py

482 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#2264217 issue节点
#3276401 pr节点
#3582 repo节点
#953922 user节点
#统一timeling数据库和issue数据库内repo的名称(已完成)
#遍历issue数据库使用hash存储满足时间要求的issue的名称(已完成)
#确认关注哪些timeline行为选择哪个字段为开发者
#遍历timeline数据库找出参与上述issue的开发者名单使用hash存储
#存储关系信息
#导入neo4j
import pymongo
import collections
import ast
import json
from github import Github
def get_issues_and_users():
    """Collect issue/PR numbers and their authors created before 2024-03-01.

    Scans three MongoDB databases — numpy and its forks (``numpy_db``),
    numpy's upstream/downstream repos (``reference_db``, filtered by the
    repo-name list file), and forks of those (``fork_db``).

    Returns:
        issue_set: dict repo-key -> set of issue numbers
        pr_set:    dict repo-key -> set of PR numbers
        user_set:  dict user id -> login, for every issue/PR author
    """
    issue_set = collections.defaultdict(set)
    pr_set = collections.defaultdict(set)
    user_set = dict()
    cutoff = "2024-03-01T00:00:00Z"

    def _scan(collection, key):
        # One pass for PRs (docs carrying a "pull_request" field) and one
        # for plain issues; record numbers per repo key and author logins.
        for doc in collection.find(
                {"created_at": {"$lt": cutoff}, "pull_request": {"$exists": True}},
                {'number': 1, 'user': 1, '_id': 0}):
            pr_set[key].add(doc['number'])
            user_set[doc['user']['id']] = doc['user']['login']
        for doc in collection.find(
                {"created_at": {"$lt": cutoff}, "pull_request": {"$exists": False}},
                {'number': 1, 'user': 1, '_id': 0}):
            issue_set[key].add(doc['number'])
            user_set[doc['user']['id']] = doc['user']['login']

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks: collections named "<repo>_issue&pr"; key is the
    # collection name with the "_issue&pr" suffix stripped.
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            _scan(db[collection_name],
                  collection_name[:collection_name.find('issue&pr') - 1])
    # Upstream/downstream repos: keep only those listed in the repo file
    # (collection names use "_" where the repo full name uses "/").
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        _scan(db[collection_name], collection_name)
    # Forks of the upstream/downstream repos: take every collection.
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        _scan(db[collection_name], collection_name)
    print('issue finished')
    return issue_set, pr_set, user_set
def get_all_users(issue_set, pr_set, user_set, event_type, user):
    """Add to ``user_set`` every user attached to matching timeline events.

    Events qualify when they belong to a collected issue/PR, happened before
    2024-03-01, and have the given ``event_type``; the user is read from the
    event field named by ``user`` (e.g. 'actor', 'assignee', 'user').

    Args:
        issue_set / pr_set: repo-key -> set of numbers, as returned by
            get_issues_and_users().
        user_set: dict user id -> login, updated in place.
        event_type: timeline event name to match (e.g. 'mentioned').
        user: name of the event field holding the user object.

    Returns:
        The (mutated) user_set.
    """
    cutoff = "2024-03-01T00:00:00Z"

    def _collect(collection, key):
        events = collection.find(
            {"issue_number": {"$in": list(issue_set[key]) + list(pr_set[key])},
             "created_at": {"$lt": cutoff},
             'event': event_type},
            {user: 1, '_id': 0})
        for event in events:
            # Some events lack the user object (e.g. deleted accounts) —
            # skip those instead of failing.
            try:
                user_set[event[user]['id']] = event[user]['login']
            except (KeyError, TypeError):
                pass

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks: timeline collections named "<repo>_timeline".
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            _collect(db[collection_name],
                     collection_name[:collection_name.find('_timeline')])
    # Upstream/downstream repos, filtered by the repo-name list file.
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        _collect(db[collection_name], collection_name)
    # Forks of the upstream/downstream repos.
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        _collect(db[collection_name], collection_name)
    print(event_type, 'finished')
    return user_set
def get_event_user_json(issue_set, pr_set, user_set, event_type, user, tmp, cnt, size):
    """Dump users seen in matching timeline events as chunked JSON files.

    For every qualifying event whose user id is still in ``user_set``, the
    user is removed from ``user_set`` and appended to ``tmp``; each time the
    running count ``cnt`` hits a multiple of ``size`` the buffer is written
    to ./json/entity/user<chunk>.json and reset.

    Args:
        issue_set / pr_set: repo-key -> set of numbers.
        user_set: dict of user ids not yet written out; mutated in place.
        event_type / user: event name and event field holding the user.
        tmp: carry-over buffer of {"login", "id"} records.
        cnt: running count of users written/buffered so far.
        size: chunk size per output file.

    Returns:
        (user_set, tmp, cnt) so the caller can chain calls and flush the tail.
    """
    cutoff = "2024-03-01T00:00:00Z"

    def _collect(collection, key):
        nonlocal tmp, cnt
        events = collection.find(
            {"issue_number": {"$in": list(issue_set[key]) + list(pr_set[key])},
             "created_at": {"$lt": cutoff},
             'event': event_type},
            {user: 1, '_id': 0})
        for event in events:
            # Some events lack the user object — skip only those; do not
            # swallow file-write errors as the original broad except did.
            try:
                record = event[user]
                uid = record['id']
                login = record['login']
            except (KeyError, TypeError):
                continue
            if uid not in user_set:
                continue
            cnt += 1
            del user_set[uid]
            tmp.append({"login": login, "id": uid})
            if not cnt % size:
                with open('./json/entity/user' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks: timeline collections named "<repo>_timeline".
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            _collect(db[collection_name],
                     collection_name[:collection_name.find('_timeline')])
    # Upstream/downstream repos, filtered by the repo-name list file.
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        _collect(db[collection_name], collection_name)
    # Forks of the upstream/downstream repos.
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        _collect(db[collection_name], collection_name)
    print(event_type, 'finished')
    return user_set, tmp, cnt
def get_user_json(issue_set, pr_set, size):
    """Dump every collected user to ./json/entity/user<chunk>.json files.

    Reads the full user-id -> login dict from user_id.txt, then removes each
    user from it as they are re-discovered: first as issue/PR authors in the
    three databases, then via the six timeline event/field combinations
    (delegated to get_event_user_json). If the dict is fully drained every
    user was written exactly once.

    Args:
        issue_set / pr_set: repo-key -> set of numbers.
        size: number of user records per output JSON file.
    """
    with open('user_id.txt', 'r', encoding='utf-8', newline='') as f:
        user_set = ast.literal_eval(f.read())
    cnt = 0
    tmp = []
    cutoff = "2024-03-01T00:00:00Z"

    def _scan(collection):
        # Author pass: any issue/PR created before the cutoff.
        nonlocal tmp, cnt
        issues = collection.find({"created_at": {"$lt": cutoff}},
                                 {'number': 1, 'user': 1, '_id': 0})
        for issue in issues:
            uid = issue['user']['id']
            if uid not in user_set:
                continue
            cnt += 1
            del user_set[uid]
            tmp.append({"login": issue['user']['login'], "id": uid})
            if not cnt % size:
                with open('./json/entity/user' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks.
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            _scan(db[collection_name])
    # Upstream/downstream repos, filtered by the repo-name list file.
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        _scan(db[collection_name])
    # Forks of the upstream/downstream repos.
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        _scan(db[collection_name])
    # Users only reachable through timeline events, one pass per
    # (event type, user field) combination.
    for event_type, field in (('mentioned', 'actor'),
                              ('labeled', 'actor'),
                              ('assigned', 'actor'),
                              ('assigned', 'assignee'),
                              ('commented', 'user'),
                              ('closed', 'actor')):
        user_set, tmp, cnt = get_event_user_json(
            issue_set, pr_set, user_set, event_type, field, tmp, cnt, size)
    # Flush the final partial chunk.
    if tmp:
        with open('./json/entity/user' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    if not len(user_set):
        print('get user json finished')
    else:
        print('error occurred')
    return None
def get_issue_pr_json(size):
    """Dump issues and PRs created before 2024-03-01 as chunked JSON files.

    Writes ./json/entity/issue<chunk>.json and ./json/entity/pr<chunk>.json,
    ``size`` documents per file, de-duplicating by (repo key, number).

    Args:
        size: number of documents per output JSON file.
    """
    cutoff = "2024-03-01T00:00:00Z"
    # Fields stripped from every document before dumping.
    projection = {'user': 0, 'labels': 0, 'assignee': 0, 'assignees': 0,
                  'repository_url': 0, 'labels_url': 0, 'comments_url': 0,
                  'events_url': 0, 'timeline_url': 0, 'html_url': 0,
                  'id': 0, 'node_id': 0, 'comments': 0,
                  'performed_via_github_app': 0, 'closed_by': 0, '_id': 0}
    issue_cnt = 0
    pr_cnt = 0
    issue_tmp = []
    pr_tmp = []
    issue_set = collections.defaultdict(set)
    pr_set = collections.defaultdict(set)

    def _normalize(doc, key):
        # Unique node name "<repo>+<number>"; rename the reaction keys
        # "+1"/"-1" to like/dislike and drop the reactions URL.
        doc["name"] = key + "+" + str(doc['number'])
        doc['reactions']['like'] = doc['reactions'].pop('+1')
        doc['reactions']['dislike'] = doc['reactions'].pop('-1')
        doc['reactions'].pop('url')

    def _scan(collection, key):
        nonlocal issue_cnt, pr_cnt, issue_tmp, pr_tmp
        # PR pass: documents carrying a "pull_request" field.
        for doc in collection.find({"created_at": {"$lt": cutoff},
                                    "pull_request": {"$exists": True}},
                                   projection):
            if doc['number'] in pr_set[key]:
                continue  # skip duplicate PRs
            pr_set[key].add(doc['number'])
            pr_cnt += 1
            _normalize(doc, key)
            pr_tmp.append(doc)
            if not pr_cnt % size:
                with open('./json/entity/pr' + str(pr_cnt // size) + '.json', 'w') as f:
                    json.dump(pr_tmp, f, indent=4)
                pr_tmp = []
        # Issue pass: no "pull_request" field.
        for doc in collection.find({"created_at": {"$lt": cutoff},
                                    "pull_request": {"$exists": False}},
                                   projection):
            if doc['number'] in issue_set[key]:
                continue  # skip duplicate issues
            issue_set[key].add(doc['number'])
            issue_cnt += 1
            _normalize(doc, key)
            issue_tmp.append(doc)
            if not issue_cnt % size:
                with open('./json/entity/issue' + str(issue_cnt // size) + '.json', 'w') as f:
                    json.dump(issue_tmp, f, indent=4)
                issue_tmp = []

    client = pymongo.MongoClient("mongodb://localhost:27017/")
    # numpy and its forks: collections named "<repo>_issue&pr".
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            _scan(db[collection_name],
                  collection_name[:collection_name.find('issue&pr') - 1])
    # Upstream/downstream repos, filtered by the repo-name list file.
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue
        _scan(db[collection_name], collection_name)
    # Forks of the upstream/downstream repos.
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        _scan(db[collection_name], collection_name)
    # Flush the final partial chunks.
    if issue_tmp:
        with open('./json/entity/issue' + str(issue_cnt // size + 1) + '.json', 'w') as f:
            json.dump(issue_tmp, f, indent=4)
    if pr_tmp:
        with open('./json/entity/pr' + str(pr_cnt // size + 1) + '.json', 'w') as f:
            json.dump(pr_tmp, f, indent=4)
    print(issue_cnt, pr_cnt)
    print('get issue and pr json finished')
    return None
def get_repo_json(issue_set, pr_set):
    """Write ./json/entity/repo.json listing every repository that has at
    least one collected issue or PR.

    Args:
        issue_set / pr_set: dicts mapping repo key -> set of numbers; repos
            whose sets are empty in both dicts are omitted.
    """
    active = {repo for repo, numbers in issue_set.items() if numbers}
    active |= {repo for repo, numbers in pr_set.items() if numbers}
    records = [{"name": repo} for repo in active]
    with open('./json/entity/repo.json', 'w') as f:
        json.dump(records, f, indent=4)
    return None
if __name__ == "__main__":
    # Pipeline for building the Neo4j entity JSON files. Earlier stages are
    # commented out once their outputs (user_id.txt, the printed counts)
    # exist; only the issue/PR dump is currently active.
    # issue_set,pr_set,user_set = get_issues_and_users()
    # user_set = get_all_users(issue_set,pr_set,user_set,'mentioned','actor')
    # user_set = get_all_users(issue_set,pr_set,user_set,'labeled','actor')
    # user_set = get_all_users(issue_set,pr_set,user_set,'assigned','actor')
    # user_set = get_all_users(issue_set,pr_set,user_set,'assigned','assignee')
    # user_set = get_all_users(issue_set,pr_set,user_set,'commented','user')
    # user_set = get_all_users(issue_set,pr_set,user_set,'closed','actor')
    #
    # with open('user_id.txt', 'w', encoding='utf-8', newline='') as f:
    #     f.write(str(user_set))
    # repo_num = len({i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]}))
    # issue_num = sum(len(value) for value in issue_set.values())
    # pr_num = sum(len(value) for value in pr_set.values())
    # user_num = len(user_set)
    # print(repo_num,issue_num,pr_num,user_num)
    # get_user_json(issue_set,pr_set,200000)
    get_issue_pr_json(25000)
    # get_repo_json(issue_set,pr_set)