# coding: utf-8
# Collects the issues of a repo; supports incremental updates.
|
||
import MySQLdb
|
||
import json
|
||
import time
|
||
import random
|
||
import urllib2
|
||
|
||
|
||
import Queue
|
||
import threading
|
||
# Work queue of (prj_id, pr_num, un, rn, pr_author) tuples: filled by the
# __main__ section and drained by the fetchThread workers.
PRS = Queue.Queue()
|
||
|
||
|
||
import dbop
|
||
# NOTE(review): presumably sizes dbop's connection pool to 3 -- confirm
# against dbop.init_pool.
dbop.init_pool(3)
|
||
|
||
|
||
import Queue
|
||
# Pool of API tokens shared by all worker threads. It is seeded once, at
# import time, from the "tokens" list of .config.json; the tokens are later
# sent in the Authorization header of api.github.com requests.
TOKEN_POOL = Queue.Queue()
with open(".config.json") as fp:
    config = json.load(fp)
tokens = config["tokens"]
for token in tokens:
    TOKEN_POOL.put(token)
|
||
|
||
def get_token():
    """Take one token out of TOKEN_POOL without blocking.

    Returns:
        The token, or None when the pool is currently empty.
    """
    # get_nowait() makes the emptiness check and the removal a single step.
    # The previous empty()-then-get() sequence could block if another thread
    # took the last token between the two calls.
    try:
        token = TOKEN_POOL.get_nowait()
    except Queue.Empty:
        return None
    # TODO: check whether this token still has remaining API quota, and put
    # it back into the pool if it does not.
    return token
|
||
|
||
def push_token(token):
    """Return *token* to TOKEN_POOL so that it can be handed out again."""
    TOKEN_POOL.put(token)
|
||
|
||
def getLongestSubstring(str1, str2):
    """Return the length of the longest common contiguous substring.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The length (an int) of the longest run of characters that appears
        contiguously in both strings; 0 when either string is empty or the
        strings share no character.
    """
    if not str1 or not str2:
        return 0

    longest = 0
    width = len(str2) + 1
    # prev_row[j] is the length of the common run ending at the previous
    # character of str1 and at str2[j - 1]; a match extends the diagonal.
    prev_row = [0] * width
    for ch1 in str1:
        row = [0] * width
        for j, ch2 in enumerate(str2, 1):
            if ch1 == ch2:
                run = prev_row[j - 1] + 1
                row[j] = run
                if run > longest:
                    longest = run
        prev_row = row
    return longest
|
||
|
||
# Lookup table loaded from ../prj_par.txt: each line holds tab-separated
# integers, and the first column is mapped to the second. The functions below
# use it to turn a project id into the repo_id stored in commit_raw_json.
repo_maps = {}
with open("../prj_par.txt", "r") as fp:
    for line in fp:
        ps = line.split("\t")
        mapped_id = int(ps[1])
        repo_maps[int(ps[0])] = mapped_id
|
||
import urllib2
|
||
# URL template for a single commit; filled with (user name, repo name, sha).
cmt_TEMPLATE = "https://api.github.com/repos/%s/%s/commits/%s"
# Base request headers; "Authorization" is filled in by _get_url.
send_headers = {
    "Content-Type": "application/json",
    "Authorization": None,
}
|
||
def _get_url(url,retry_times=3):
|
||
|
||
while True:
|
||
token = get_token()
|
||
if token is None:
|
||
continue
|
||
send_headers["Authorization"] = "token %s"%token
|
||
push_token(token)
|
||
break
|
||
|
||
print "\t%s: "%threading.current_thread().name, url,token
|
||
|
||
req = urllib2.Request(url,headers = send_headers)
|
||
try:
|
||
error_msg, result = None,None
|
||
result = urllib2.urlopen(req,timeout=20)
|
||
raw_data = result.read().decode('utf-8')
|
||
except urllib2.HTTPError, e:
|
||
error_msg = e.code
|
||
except urllib2.URLError, e:
|
||
error_msg = e.reason
|
||
except Exception,e:
|
||
error_msg = e.message
|
||
|
||
if error_msg != None:
|
||
print error_msg
|
||
if retry_times == 0:
|
||
return None, result
|
||
else:
|
||
time.sleep(3*(4-retry_times))
|
||
return _get_url(url,retry_times-1)
|
||
return raw_data,result
|
||
|
||
|
||
def mutl_cmt_authors(repo_id, pr_num):
    """Tell whether the stored commits of a pull request have several authors.

    Args:
        repo_id: project id; translated through repo_maps before querying.
        pr_num: pull-request number.

    Returns:
        True as soon as two distinct lower-cased commit author names are
        seen; False otherwise, including when no commit JSON is stored.
    """
    row = dbop.select_one(
        "select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",
        (repo_maps[repo_id], pr_num))
    if row is None or row[0] is None:
        print("None commits")
        return False

    seen_names = set()
    for commit in json.loads(row[0]):
        seen_names.add(commit["commit"]["author"]["name"].lower())
        if len(seen_names) >= 2:
            return True
    return False
|
||
|
||
def fetchThread():
|
||
global PRS
|
||
print "\t%s: starts to work"%( threading.current_thread().name)
|
||
while True:
|
||
try:
|
||
pr = PRS.get(block=False)
|
||
print ("\t%s: fetch"%( threading.current_thread().name)),pr
|
||
except Exception,e:
|
||
print e, ("\t%s: no more PRS"%( threading.current_thread().name))
|
||
break
|
||
pr_real_commit(pr)
|
||
|
||
|
||
def get_commit(sha, un, rn, max_attempts=None):
    """Download one commit and store its raw JSON in the real_cmts table.

    Nothing is fetched when a row with this sha already exists.

    Args:
        sha: commit sha.
        un: user (owner) name of the repository.
        rn: repository name.
        max_attempts: optional cap on the number of _get_url rounds. The
            default None keeps retrying until a body is obtained, which is
            the historical behaviour; pass a positive int to give up instead
            of looping forever on a URL that never succeeds.
    """
    existing = dbop.select_one("select sha from real_cmts where sha=%s", (sha,))
    if existing is not None:
        print("already sha")
        return

    # The URL does not change between attempts, so build it once.
    url = cmt_TEMPLATE % (un, rn, sha)
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        print(url)
        ets, _ = _get_url(url)
        if ets is None:
            print("返回none")
            continue
        dbop.execute("insert into real_cmts (sha,info) values(%s,%s)", (sha, ets))
        break
|
||
|
||
def pr_real_commit(pr):
|
||
prj_id, pr_num, un, rn, pr_author = pr
|
||
|
||
pr_author = pr_author.lower()
|
||
pr_author_name = dbop.select_one("select user_name from allrepo_users where name=%s",(pr_author,))
|
||
if pr_author_name is not None:
|
||
pr_author_name = pr_author_name[0]
|
||
if pr_author_name is not None:
|
||
pr_author_name = pr_author_name.lower()
|
||
|
||
|
||
pr_cmts = dbop.select_one("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[prj_id], pr_num))
|
||
if pr_cmts is None or pr_cmts[0] is None:
|
||
pass
|
||
# print "None commits"
|
||
pr_cmts = json.loads(pr_cmts[0])
|
||
# print "total commits:", len(pr_cmts)
|
||
authors = set()
|
||
real_authors = set()
|
||
cmts = []
|
||
for prc in pr_cmts:
|
||
cmt_author = prc["commit"]["author"]["name"].lower()
|
||
authors.add(cmt_author)
|
||
FLAG = False
|
||
if cmt_author == pr_author:
|
||
FLAG = True
|
||
if pr_author_name is not None and pr_author_name == cmt_author:
|
||
FLAG = True
|
||
if cmt_author.find(pr_author)!= -1:
|
||
FLAG = True
|
||
if pr_author_name is not None and cmt_author.find(pr_author_name)!= -1:
|
||
FLAG = True
|
||
if pr_author_name is not None and pr_author_name.find(cmt_author)!= -1:
|
||
FLAG = True
|
||
|
||
# 最大字符串匹配
|
||
len_th = 3
|
||
if getLongestSubstring(cmt_author, pr_author) > len_th or (pr_author_name is not None and getLongestSubstring(cmt_author,pr_author) > len_th):
|
||
FLAG = True
|
||
|
||
if FLAG:
|
||
real_authors.add(cmt_author)
|
||
cmts.append(prc["sha"])
|
||
print prc["sha"],cmt_author, pr_author, pr_author_name
|
||
get_commit(prc["sha"], un, rn)
|
||
|
||
# print "https://github.com/%s/%s/pull/%d"%(un, rn, pr_num), len(pr_cmts), len(authors)
|
||
# print "\t pr_author:", pr_author, pr_author_name
|
||
# print "\t cmt authors:", authors
|
||
# print "\t cmt real-authors:", real_authors
|
||
|
||
# print "*******"
|
||
return cmts
|
||
|
||
if __name__ == "__main__":
|
||
global PRS
|
||
|
||
repo_names = dbop.select_all("select id,user_name, repo_name from project")
|
||
repo_names = {item[0]:(item[1], item[2]) for item in repo_names}
|
||
|
||
# 取出所有pr
|
||
prs = dbop.select_all("select prj_id, pr_num, author from `pull-request`")
|
||
for pr in prs:
|
||
prj_id, pr_num, pr_author = pr
|
||
un, rn = repo_names[prj_id]
|
||
|
||
if mutl_cmt_authors(prj_id, pr_num):
|
||
PRS.put((prj_id, pr_num, un, rn, pr_author))
|
||
print "total prs: ", PRS.qsize()
|
||
|
||
# 并行采集
|
||
thread_list = []
|
||
threading_num = 20
|
||
print("threads number:%d"%(threading_num,))
|
||
for i in range(0,threading_num):
|
||
t = threading.Thread(target=fetchThread,name="Thread-%d"%i)
|
||
thread_list.append(t)
|
||
|
||
for thread in thread_list:
|
||
thread.start()
|
||
for thread in thread_list:
|
||
thread.join()
|
||
|
||
print("all threads done work")
|
||
|