# Source file: github_mongoDB_dataprocess/get_fork_repos.py
# (code-viewer metadata at time of capture: 69 lines, 2.4 KiB, Python)

import ast
import os
import random
from datetime import datetime, timezone

import pandas as pd
from github import Github
def _as_utc(moment):
    """Return *moment* as a timezone-aware UTC datetime.

    Naive values are taken to already be in UTC (GitHub API timestamps are
    UTC), so naive and aware datetimes can be compared without a TypeError.
    """
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def _count_general_issues(issues, cutoff):
    """Count items of *issues* that are not pull requests and were created on or before *cutoff*."""
    return sum(
        1
        for issue in issues
        if not issue.pull_request and _as_utc(issue.created_at) <= cutoff
    )


def get_name(access_token, repo, cutoff=None):
    """Count the general (non-pull-request) issues of every fork of *repo*.

    For each fork, the issues returned by ``get_issues(since=<fork creation
    time>)`` are counted, skipping pull requests and issues created after
    *cutoff*. The per-fork counts are sorted in descending order, printed and
    written to ``./fork_issue_number/<owner>_<name>_fork_repo_issues.xlsx``
    with the columns ``repo_name`` and ``number``.

    A fork whose issues cannot be fetched is left out of the result and the
    failure is appended to ``./errs/numpy_fork_repo_issues_err.txt``.

    Args:
        access_token: GitHub token passed to ``Github``.
        repo: Repository in ``"owner/name"`` form.
        cutoff: Optional ``datetime``; issues created after it are ignored.
            Defaults to 2024-03-01 00:00:00 UTC.

    Returns:
        Always ``0``.
    """
    if cutoff is None:
        cutoff = datetime(2024, 3, 1, tzinfo=timezone.utc)
    else:
        cutoff = _as_utc(cutoff)

    g = Github(access_token)
    forks = g.get_repo(repo).get_forks()
    print(forks.totalCount)

    repo_dict = {}
    for fork in forks:
        try:
            # `fork` is already a repository object and `created_at` is already
            # a datetime, so neither a second lookup by name nor a
            # str()/strptime() round trip is needed.
            # NOTE(review): no `state` is passed, so the API default applies
            # (open issues only, per the GitHub REST docs) - confirm intended.
            issues = fork.get_issues(since=fork.created_at)
            # Assign only after the whole listing was consumed, so a failure
            # half-way through never leaves a partial count behind.
            repo_dict[fork.full_name] = _count_general_issues(issues, cutoff)
            print(issues.totalCount, end=' ')
        except Exception as e:
            # Best effort: record the failure and continue with the next fork.
            # Append mode keeps earlier errors instead of overwriting them.
            os.makedirs('./errs', exist_ok=True)
            with open('./errs/numpy_fork_repo_issues_err.txt', 'a', encoding='utf-8', newline='') as f:
                f.write(str(fork.full_name) + f"发生的错误类型是:{type(e)}" + f"错误信息是:{e}" + "\n")

    # sorted() is stable, so forks with equal counts keep their listing order.
    repo_dict = dict(sorted(repo_dict.items(), key=lambda item: item[1], reverse=True))
    print(repo_dict)

    df = pd.DataFrame(list(repo_dict.items()), columns=['repo_name', 'number'])
    os.makedirs('./fork_issue_number', exist_ok=True)
    df.to_excel('./fork_issue_number/' + repo.replace('/', '_') + '_fork_repo_issues.xlsx', index=False)

    now = datetime.now()
    print("当前时间是:", now.strftime("%Y-%m-%d %H:%M:%S"))
    return 0
def get_data(access_token, repo, threshold):
    """Return the forks of *repo* whose recorded issue count exceeds *threshold*.

    Reads ``./fork_issue_number/<owner>_<name>_fork_repo_issues.xlsx`` (the
    slash in *repo* is replaced by an underscore) and expects the columns
    ``repo_name`` and ``number``.

    Args:
        access_token: Not used by this function; kept so existing callers
            keep working.
        repo: Repository in ``"owner/name"`` form.
        threshold: Only rows whose ``number`` is strictly greater are kept.

    Returns:
        List of ``repo_name`` values, in the spreadsheet's row order.
        (The selection used to be computed and then discarded.)
    """
    df = pd.read_excel('./fork_issue_number/' + repo.replace('/', '_') + '_fork_repo_issues.xlsx')
    filtered_repos = df.loc[df['number'] > threshold, 'repo_name'].tolist()
    return filtered_repos
if __name__ == "__main__":
    # SECURITY: personal access tokens used to be hard-coded here (including
    # in commented-out lines). Any token that was ever committed must be
    # treated as leaked and revoked. The token is now read from the environment.
    access_token = os.environ.get("GITHUB_TOKEN")
    if not access_token:
        raise SystemExit("Set the GITHUB_TOKEN environment variable to a GitHub personal access token.")

    # Slice of the repository list handled by this run.
    # Note kept from the original script: "0-400 466-"
    # (presumably the index ranges already processed - confirm before relying on it).
    start_index = 389
    end_index = 400

    # The file holds a Python list literal of "owner/name" strings.
    with open('numpy_refer_repo_name.txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = ast.literal_eval(f.read())

    # enumerate() reports the true position even when a name occurs more than
    # once, and avoids a linear list.index() search on every iteration.
    for index, repo in enumerate(repo_list[start_index:end_index], start=start_index):
        print(index, end=' ')
        print(repo, end=' ')
        get_name(access_token, repo)