"""Collect repository, release, and commit metadata from popular GitHub repos.

Searches GitHub for repositories with more than 3000 stars (sorted by fork
count, descending), skips repositories with too few releases or commits, and
writes:

  * ``repo_data.csv``            -- one row per kept repository (checkpointed
                                    after every repo)
  * ``<name>_release_data.csv``  -- one row per release of that repository
  * ``<name>_commit_data.csv``   -- one row per commit of that repository
"""

import csv
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd

from github import Github

# NOTE(security): the access token used to be hard-coded here. Read it from
# the environment instead so the credential is never committed to the repo.
g = Github(os.environ["GITHUB_TOKEN"])

# Repositories with more than 3000 stars, sorted by fork count, descending.
repos = g.search_repositories(query='stars:>3000', sort='forks', order='desc')

repo_info = []
count = 0

for repo in repos:
    # Basic repository information.
    name = repo.name.replace("/", "")  # repo.name has no '/'; kept for safety
    owner = repo.owner.login
    created_at = repo.created_at
    repo_lang = repo.language

    # Release and commit collections. Each .totalCount triggers an extra
    # API request, so fetch them once up front.
    releases = repo.get_releases()
    release_num = releases.totalCount

    commits = repo.get_commits()
    commit_data = []
    commit_num = commits.totalCount

    # Skip repositories with fewer than 2 releases.
    if release_num <= 1:
        continue
    # Skip repositories with 100 or fewer commits.
    if commit_num <= 100:
        continue

    count += 1
    print(count, ' : ', name)

    # Resume point: the first 50 qualifying repositories were processed in a
    # previous run, so only their counter is advanced here — TODO confirm.
    if count < 51:
        continue

    repo_info.append([count, name, owner, created_at, repo_lang,
                      release_num, commit_num])
    # Checkpoint the summary CSV after every repo so a crash or rate-limit
    # abort loses at most the current repository.
    df = pd.DataFrame(repo_info,
                      columns=['Count', 'Name', 'Owners', 'Repo_Created_At',
                               'Language', 'Release_Num', 'Commit_Num'])
    df.to_csv('repo_data.csv')

    # Per-release rows: title, tag, author, creation time, and the release's
    # age relative to the repository's creation (in seconds).
    release_data = []
    for release in releases:
        release_title = release.title
        release_tag = release.tag_name
        # A release's author can be missing (deleted account); guard it.
        release_author = release.author.name if release.author else None
        release_created_at = release.created_at  # type: datetime.datetime
        release_sec = (release_created_at - created_at).total_seconds()
        release_data.append([release_title, release_tag, release_author,
                             release_created_at, release_sec])

    with open(f'{name}_release_data.csv', 'w', newline='',
              encoding='utf-8') as file:
        writer = csv.writer(file)
        try:
            writer.writerows(release_data)
        except UnicodeEncodeError:
            # Fixed: this message previously named the commit CSV.
            print(f"Error writing {name}_release_data.csv: "
                  f"illegal multibyte sequence")

    # Per-commit rows: sha, creation time, and age relative to repo creation.
    count0 = 0
    for commit in commits:
        commit_hash = commit.sha
        commit_created_at = commit.commit.author.date  # e.g. 2018-10-11 03:04:52
        commit_sec = (commit_created_at - created_at).total_seconds()
        print(commit_sec)
        # NOTE: commit.commit.message could be analysed (e.g. with NLP) to
        # relate commits to releases.
        commit_data.append([commit_hash, commit_created_at, commit_sec])

        # Crude rate limiting: pause 10 s after every 1000 commits fetched.
        count0 += 1
        if count0 > 1000:
            time.sleep(10)
            count0 = 0

    with open(f'{name}_commit_data.csv', 'w', newline='',
              encoding='utf-8') as file:
        writer = csv.writer(file)
        try:
            writer.writerows(commit_data)
        except UnicodeEncodeError:
            print(f"Error writing {name}_commit_data.csv: "
                  f"illegal multibyte sequence")