release_analysis/spider.py

94 lines
3.8 KiB
Python

import csv
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd
from github import *
# NOTE(security): a real personal-access token was previously hard-coded here.
# That token is compromised and must be revoked on github.com. Read the token
# from the environment instead; raises KeyError if GITHUB_TOKEN is unset.
g = Github(os.environ["GITHUB_TOKEN"])

# Popular repositories (>3000 stars), most-forked first.
repos = g.search_repositories(query='stars:>3000', sort='forks', order='desc')

repo_info = []
count = 0
for repo in repos:
    # Basic repository metadata.
    name = repo.name.replace("/", "")
    owner = repo.owner.login
    created_at = repo.created_at
    repo_lang = repo.language

    # Releases and commits for this repo (lazy paginated lists).
    releases = repo.get_releases()
    release_num = releases.totalCount
    commits = repo.get_commits()
    commit_num = commits.totalCount

    # Skip repositories with fewer than 2 releases or too few commits.
    if release_num <= 1:
        continue
    if commit_num <= 100:
        continue

    count += 1
    print(count, ' : ', name)
    # Resume support: the first 50 qualifying repos were processed in an
    # earlier run, so they are counted but not re-downloaded.
    if count < 51:
        continue

    repo_info.append([count, name, owner, created_at, repo_lang,
                      release_num, commit_num])
    # Re-write the summary CSV every iteration so progress survives a crash
    # or an API rate-limit abort.
    df = pd.DataFrame(repo_info,
                      columns=['Count', 'Name', 'Owners', 'Repo_Created_At',
                               'Language', 'Release_Num', 'Commit_Num'])
    df.to_csv('repo_data.csv')

    # One row per release: title, tag, author, timestamp, and seconds
    # elapsed between repo creation and the release.
    release_data = []
    for release in releases:
        release_created_at = release.created_at  # datetime.datetime
        release_sec = (release_created_at - created_at).total_seconds()
        release_data.append([release.title, release.tag_name,
                             release.author.name, release_created_at,
                             release_sec])
    with open(f'{name}_release_data.csv', 'w', newline='',
              encoding='utf-8') as file:
        writer = csv.writer(file)
        try:
            writer.writerows(release_data)
        except UnicodeEncodeError:
            # BUG FIX: the original message wrongly named the *commit* file.
            print(f"Error writing {name}_release_data.csv: "
                  f"illegal multibyte sequence")

    # One row per commit: sha, timestamp, and seconds elapsed since repo
    # creation. commit.commit.message could later be mined with NLP to
    # relate commits to releases.
    commit_data = []
    count0 = 0
    for commit in commits:
        commit_created_at = commit.commit.author.date  # e.g. 2018-10-11 03:04:52
        commit_sec = (commit_created_at - created_at).total_seconds()
        print(commit_sec)
        commit_data.append([commit.sha, commit_created_at, commit_sec])
        # Throttle every ~1000 commits to stay under the GitHub API
        # rate limit.
        count0 += 1
        if count0 > 1000:
            time.sleep(10)
            count0 = 0
    with open(f'{name}_commit_data.csv', 'w', newline='',
              encoding='utf-8') as file:
        writer = csv.writer(file)
        try:
            writer.writerows(commit_data)
        except UnicodeEncodeError:
            print(f"Error writing {name}_commit_data.csv: "
                  f"illegal multibyte sequence")