duppr_analysis/experiment_code/minor_revision/get_real_cmts.py

232 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding: utf-8
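# Collect a repo's issues (supports updating).
# NOTE(review): this script actually fetches the commits of pull requests from the GitHub API; the line above looks inherited from another script.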
import MySQLdb
import json
import time
import random
import urllib2
import Queue
import threading
# Work queue of pull requests: filled by the __main__ block and drained by
# the fetchThread workers.
PRS = Queue.Queue()
import dbop
# Initialise dbop's pool before any query is issued.
# NOTE(review): 3 is presumably the pool size -- confirm against dbop.
dbop.init_pool(3)
import Queue
# Pool of GitHub API tokens shared by all worker threads.
TOKEN_POOL = Queue.Queue()
# Load every token listed under "tokens" in .config.json into the pool.
with open(".config.json") as fp:
    config = json.load(fp)
tokens = config["tokens"]
for token in tokens:
    TOKEN_POOL.put(token)
def get_token():
    """Take one API token out of the shared pool without blocking.

    Returns:
        The next token from ``TOKEN_POOL``, or ``None`` when the pool is
        currently empty.  The caller is expected to hand the token back
        with ``push_token``.
    """
    # A separate empty() check followed by a blocking get() is racy when
    # several worker threads share the pool: another thread can take the
    # last token in between, leaving this one blocked inside get().
    # get_nowait() makes "take a token or report empty" a single step.
    try:
        token = TOKEN_POOL.get_nowait()
    except Queue.Empty:
        return None
    # TODO: check whether the token still has API quota left; if it does
    # not, put it back into the pool instead of returning it.
    return token
def push_token(token):
    """Hand *token* back to the shared pool so it can be given out again."""
    TOKEN_POOL.put(token, block=True)
def getLongestSubstring(str1, str2):
    """Return the length of the longest common contiguous substring.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The number of characters in the longest run that occurs in both
        strings; 0 when either string is empty or they share no character.
    """
    if not str1 or not str2:
        return 0

    longest = 0
    width = len(str2) + 1
    # prev[j + 1] is the length of the common run that ends at the previous
    # character of str1 and at str2[j]; only one previous row is needed.
    prev = [0] * width
    for ch1 in str1:
        cur = [0] * width
        for j, ch2 in enumerate(str2):
            if ch1 == ch2:
                run = prev[j] + 1
                cur[j + 1] = run
                if run > longest:
                    longest = run
        prev = cur
    return longest
# Mapping read from ../prj_par.txt: first tab-separated column -> second
# column, both parsed as ints.
# NOTE(review): it is indexed with a project id and the result is used as
# repo_id in the commit_raw_json queries -- presumably project id -> parent
# repo id; confirm against the data file.
repo_maps = {}
with open("../prj_par.txt","r")as fp:
    for line in fp.readlines():
        ps = line.split("\t")
        repo_maps[int(ps[0])] = int(ps[1])
import urllib2
# URL template for one commit in the GitHub API: (user name, repo name, sha).
cmt_TEMPLATE = "https://api.github.com/repos/%s/%s/commits/%s"
# Base request headers; "Authorization" is a placeholder that is set to
# "token <token>" for each request.
send_headers = {"Content-Type":"application/json","Authorization":None}
def _get_url(url, retry_times=3):
    """Fetch *url* with a pooled API token, retrying on failure.

    Args:
        url: the address to request.
        retry_times: how many more attempts are allowed after a failed one.

    Returns:
        A ``(raw_data, result)`` tuple.  ``raw_data`` is the response body
        decoded as UTF-8 and ``result`` the response object.  When every
        attempt failed, ``raw_data`` is ``None`` and ``result`` is the
        response object of the last attempt if it could be opened, else
        ``None``.
    """
    # Wait for a token; sleep briefly instead of spinning at full CPU while
    # the pool is momentarily empty.
    while True:
        token = get_token()
        if token is not None:
            break
        time.sleep(0.1)
    # The token is only needed to build the header, so it goes straight back.
    push_token(token)

    # Build the headers per call.  Writing the token into the shared
    # module-level dict lets another worker thread overwrite "Authorization"
    # between the assignment and the construction of the request.
    headers = dict(send_headers)
    headers["Authorization"] = "token %s" % token

    print("\t%s:  %s %s" % (threading.current_thread().name, url, token))
    req = urllib2.Request(url, headers=headers)

    error_msg, result, raw_data = None, None, None
    try:
        result = urllib2.urlopen(req, timeout=20)
        raw_data = result.read().decode('utf-8')
    except urllib2.HTTPError as e:
        error_msg = e.code
    except urllib2.URLError as e:
        error_msg = e.reason
    except Exception as e:
        # Many exceptions carry an empty ``message``; fall back to repr so
        # the log line still says what went wrong.
        error_msg = getattr(e, "message", None) or repr(e)

    if error_msg is None:
        return raw_data, result

    print(error_msg)
    # "<=" also stops a negative retry_times from recursing without end.
    if retry_times <= 0:
        return None, result
    # Back off longer on each retry (3s, 6s, 9s for the default budget);
    # clamp so a larger retry_times cannot produce a negative sleep.
    time.sleep(max(0, 3 * (4 - retry_times)))
    return _get_url(url, retry_times - 1)
def mutl_cmt_authors(repo_id, pr_num):
    """Tell whether the stored commits of a PR carry more than one author name.

    Args:
        repo_id: key into ``repo_maps``; the mapped value is used as
            ``repo_id`` in the ``commit_raw_json`` query.
        pr_num: pull-request number.

    Returns:
        ``True`` when the commits' author names (compared in lower case)
        contain at least two distinct values, ``False`` otherwise -- also
        when no commit JSON is stored for the PR.
    """
    row = dbop.select_one("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
    if row is None or row[0] is None:
        print("None commits")
        return False
    distinct_names = {
        commit["commit"]["author"]["name"].lower()
        for commit in json.loads(row[0])
    }
    return len(distinct_names) > 1
def fetchThread():
    """Worker loop: take PRs off ``PRS`` and run ``pr_real_commit`` on each.

    Returns once the queue is empty so that the thread can finish.
    """
    worker = threading.current_thread().name
    print("\t%s: starts to work" % worker)
    while True:
        # Only the queue access belongs in the try, and only Queue.Empty
        # means "finished".  Catching every exception here used to report
        # any failure of the log line below as "no more PRS" and silently
        # end the worker.
        try:
            pr = PRS.get(block=False)
        except Queue.Empty as e:
            print("%s %s" % (e, "\t%s: no more PRS" % worker))
            break
        print("%s %s" % ("\t%s: fetch" % worker, pr))
        pr_real_commit(pr)
def get_commit(sha,un, rn):
    """Download one commit from the API and store it in ``real_cmts``.

    Nothing is fetched when a row with this sha already exists.

    Args:
        sha: commit hash.
        un: user (owner) name used in the API URL.
        rn: repository name used in the API URL.
    """
    if dbop.select_one("select sha from real_cmts where sha=%s",(sha,)) is not None:
        print("already sha")
        return

    url = cmt_TEMPLATE%(un,rn,sha)
    ets = None
    # NOTE(review): this keeps retrying for as long as _get_url yields no
    # body, so a commit that can never be fetched stalls the calling thread.
    while ets is None:
        print(url)
        ets, _response = _get_url(url)
        if ets is None:
            print("返回none")
    dbop.execute("insert into real_cmts (sha,info) values(%s,%s)", (sha, ets))
def _matches_pr_author(cmt_author, pr_author, pr_author_name, len_th=3):
    """Decide heuristically whether a commit author name denotes the PR author.

    All names are expected in lower case.  ``pr_author_name`` may be ``None``
    when no second name is known for the PR author.
    """
    # The PR author's name is contained in (or equal to) the commit author.
    if pr_author in cmt_author:
        return True
    # Fuzzy match: a shared run of more than len_th characters.
    if getLongestSubstring(cmt_author, pr_author) > len_th:
        return True
    if pr_author_name is None:
        return False
    return (pr_author_name in cmt_author
            or cmt_author in pr_author_name
            or getLongestSubstring(cmt_author, pr_author_name) > len_th)


def pr_real_commit(pr):
    """Fetch the commits of a PR that appear to be written by its author.

    Args:
        pr: tuple ``(prj_id, pr_num, un, rn, pr_author)`` as queued by the
            main block: project id, PR number, user name and repo name for
            the API URL, and the PR author.

    Returns:
        List of the shas of the commits attributed to the PR author; each
        of them has been passed to ``get_commit``.  Empty when no commit
        JSON is stored for the PR.
    """
    prj_id, pr_num, un, rn, pr_author = pr
    pr_author = pr_author.lower()

    # Second name of the PR author from allrepo_users, if there is one.
    row = dbop.select_one("select user_name from allrepo_users where name=%s",(pr_author,))
    pr_author_name = row[0] if row is not None else None
    if pr_author_name is not None:
        pr_author_name = pr_author_name.lower()

    row = dbop.select_one("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[prj_id], pr_num))
    if row is None or row[0] is None:
        # Previously this case fell through and failed while indexing or
        # parsing the missing JSON; a PR without commits has no real commits.
        return []

    cmts = []
    for prc in json.loads(row[0]):
        cmt_author = prc["commit"]["author"]["name"].lower()
        # The fuzzy comparison against pr_author_name used to compare
        # pr_author a second time (copy-paste slip), so the second name
        # never took part in it; the helper uses pr_author_name.
        if not _matches_pr_author(cmt_author, pr_author, pr_author_name):
            continue
        cmts.append(prc["sha"])
        print("%s %s %s %s" % (prc["sha"], cmt_author, pr_author, pr_author_name))
        get_commit(prc["sha"], un, rn)
    return cmts
if __name__ == "__main__":
    # No ``global PRS`` here: at module scope the name is already global,
    # and declaring it after the assignment above is a SyntaxWarning on
    # Python 2 and a SyntaxError on Python 3.

    # project id -> (user name, repo name)
    repo_names = dbop.select_all("select id,user_name, repo_name from project")
    repo_names = {item[0]:(item[1], item[2]) for item in repo_names}

    # Fetch all PRs and queue those whose commits have several authors.
    prs = dbop.select_all("select prj_id, pr_num, author from `pull-request`")
    for prj_id, pr_num, pr_author in prs:
        un, rn = repo_names[prj_id]
        if mutl_cmt_authors(prj_id, pr_num):
            PRS.put((prj_id, pr_num, un, rn, pr_author))
    print("total prs:  %d" % PRS.qsize())

    # Collect in parallel: start all workers, then wait for all of them.
    threading_num = 20
    print("threads number:%d"%(threading_num,))
    thread_list = [
        threading.Thread(target=fetchThread, name="Thread-%d" % i)
        for i in range(threading_num)
    ]
    for thread in thread_list:
        thread.start()
    for thread in thread_list:
        thread.join()
    print("all threads done work")