github_mongoDB_dataprocess/github_to_mongodb.py

89 lines
3.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import ast
import os
import time
from datetime import datetime

import pymongo
from github import Github

from fork_analysis import find_fork_names
def get_timeline_event(repository_name, db, clt, access_token):
    """Copy the timeline events of every issue in a GitHub repository into MongoDB.

    For each issue returned by ``get_issues(state='all')`` the issue's
    timeline is fetched and every event's ``raw_data`` dict is inserted as
    one document, tagged with an extra ``issue_number`` field.

    Args:
        repository_name: Repository identifier passed to ``Github.get_repo``
            (the callers in this file pass ``"owner/name"`` strings).
        db: Name of the MongoDB database to write to.
        clt: Name of the collection inside ``db``.
        access_token: GitHub access token used to authenticate the client.

    Failures while processing a single issue are appended to
    ``get_numpy_timeline_err.txt`` and the crawl continues with the next
    issue. Errors raised outside the per-issue ``try`` (fetching the
    repository, paging through the issue list) still propagate.
    """
    # Connect to GitHub.
    g = Github(access_token)
    repository = g.get_repo(repository_name)

    # Connect to MongoDB. The context manager closes the client when the
    # crawl ends; this function is called once per repository, so an
    # unclosed client would leak a connection pool on every call.
    with pymongo.MongoClient("mongodb://localhost:27017/") as client:
        collection = client[db][clt]

        # Fetch the repository's issues, open and closed alike.
        issues = repository.get_issues(state='all')
        print(issues.totalCount)

        for issue in issues:
            # Pull requests are not filtered out (the `issue.pull_request`
            # skip that used to sit here is disabled).
            try:
                for event in issue.get_timeline():
                    raw_data = event.raw_data
                    # Record which issue the event belongs to.
                    raw_data['issue_number'] = issue.number
                    collection.insert_one(raw_data)
            except Exception as e:
                # Best effort: log the failure and move on to the next issue.
                with open('get_numpy_timeline_err.txt', 'a', encoding='utf-8', newline='') as f:
                    # One record per line; without the terminator successive
                    # failures run together into a single unreadable line.
                    f.write(
                        f"{repository_name} {issue.number}"
                        f"发生的错误类型是:{type(e)}"
                        f"错误信息是:{e}\n"
                    )

    # Print a completion timestamp.
    now = datetime.now()
    print("当前时间是:", now.strftime("%Y-%m-%d %H:%M:%S"))
def get_issue_and_pr(repository_name, db, clt, access_token):
    """Copy the raw data of every issue in a GitHub repository into MongoDB.

    Each issue returned by ``get_issues(state='all')`` has its ``raw_data``
    dict inserted as one document.

    Args:
        repository_name: Repository identifier passed to ``Github.get_repo``
            (the callers in this file pass ``"owner/name"`` strings).
        db: Name of the MongoDB database to write to.
        clt: Name of the collection inside ``db``.
        access_token: GitHub access token used to authenticate the client.

    Failures while processing a single issue are appended to
    ``get_numpy_err.txt`` and the crawl continues with the next issue.
    Errors raised outside the per-issue ``try`` (fetching the repository,
    paging through the issue list) still propagate.
    """
    # Connect to GitHub.
    g = Github(access_token)
    repository = g.get_repo(repository_name)

    # Connect to MongoDB. The context manager closes the client when the
    # crawl ends instead of leaking a connection pool on every call.
    with pymongo.MongoClient("mongodb://localhost:27017/") as client:
        collection = client[db][clt]

        # Fetch the repository's issues, open and closed alike.
        issues = repository.get_issues(state='all')
        print(issues.totalCount)

        # Store each issue's raw data in MongoDB.
        for issue in issues:
            # Pull requests are not filtered out (the `issue.pull_request`
            # skip that used to sit here is disabled).
            try:
                collection.insert_one(issue.raw_data)
            except Exception as e:
                # Best effort: log the failure and move on to the next issue.
                with open('get_numpy_err.txt', 'a', encoding='utf-8', newline='') as f:
                    # The repository name is included (as in
                    # get_timeline_event's log) because this function runs
                    # over many repositories and a bare issue number is
                    # ambiguous. One record per line.
                    f.write(
                        f"{repository_name} {issue.number}"
                        f"发生的错误类型是:{type(e)}"
                        f"错误信息是:{e}\n"
                    )

    # Print a completion timestamp.
    now = datetime.now()
    print("当前时间是:", now.strftime("%Y-%m-%d %H:%M:%S"))
# SECURITY: a GitHub personal access token used to be hard-coded on this
# line. A token committed to source control must be treated as leaked:
# revoke it on GitHub and supply a fresh one through the environment.
token = os.environ.get("GITHUB_TOKEN")
if not token:
    raise RuntimeError(
        "GITHUB_TOKEN is not set; export a GitHub personal access token "
        "before running this script."
    )

# Earlier crawl stages, kept for reference (currently disabled):
#
# numpy itself
# get_timeline_event('numpy/numpy', 'numpy_db', 'issue_timeline_new', token)
# get_issue_and_pr('numpy/numpy', 'numpy_db', 'issue_pr', token)
#
# forks of numpy
# repo_list = ['seberg/numpy', 'hpyproject/numpy-hpy', 'xman/numpy-posit',
#              'Mukulikaa/numpy', 'George-Bassilious/numpy', 'plctlab/numpy',
#              'lvcarlosja/numpy']
# for repos in repo_list:
#     get_timeline_event(repos, 'numpy_db', str(repos).replace('/', '_') + '_timeline', token)
#     get_issue_and_pr(repos, 'numpy_db', str(repos).replace('/', '_') + '_issue&pr', token)
#
# numpy's upstream/downstream projects, and the upstream/downstream
# projects of numpy's forks: 2048+1

# Forks of numpy's upstream/downstream projects: 1587. Forks of the
# upstream/downstream projects of numpy's forks that meet the criteria: 0.
# NOTE(review): find_fork_names lives in fork_analysis; it presumably
# returns "owner/name" strings -- confirm against that module.
repo_list = find_fork_names(token)
for repos in repo_list:
    # One collection per repository, named after it with '/' replaced by '_'.
    get_timeline_event(repos, 'fork_timeline', str(repos).replace('/', '_'), token)
    # get_issue_and_pr(repos, 'fork_db', str(repos).replace('/', '_'), token)

# TODO: add a module that records the ids of repositories already
# downloaded, so that a newly added repository can be checked against it
# before it is crawled again.