325 lines
10 KiB
Python
325 lines
10 KiB
Python
#coding:utf-8
|
||
# Compute redundancy
|
||
|
||
import pandas as pd
|
||
import MySQLdb
|
||
import json
|
||
# Read the shared JSON configuration and open the MySQL connection and cursor
# that every query in this module goes through.
with open("../_config.json") as config_file:
    config = json.load(config_file)

local_db_config = config["local_db"]

conn = MySQLdb.connect(
    host=local_db_config["db_host"],
    user=local_db_config["db_user"],
    passwd=local_db_config["db_passwd"],
    db=local_db_config["db_name"],
    port=3306,
    charset='utf8mb4',
)
cursor = conn.cursor()
|
||
|
||
# Lookup table built from the tab-separated file "../prj_par.txt":
# int(first column) -> int(second column). It is used below to translate the
# repo_id passed to the commit_raw_json queries.
with open("../prj_par.txt", "r") as mapping_file:
    repo_maps = dict(
        (int(columns[0]), int(columns[1]))
        for columns in (row.split("\t") for row in mapping_file)
    )
|
||
|
||
|
||
def omit_problem_pair(DUP_PRS, NOTE, MST_PR):
    """Return the PRs of a duplicate group that are meaningful to compare.

    Args:
        DUP_PRS: list of duplicate PR numbers (ints). It is never modified.
        NOTE: annotation of the group. Either the literal string "None", or
            "<label>:<info>" where <info> is a ";"-separated list of
            "a->b" pairs of PR numbers.
        MST_PR: the master PR number of the group.

    Returns:
        A new list.
        * NOTE == "None": a copy of DUP_PRS (the master PR is not included).
        * label "Partial": DUP_PRS + [MST_PR] without the right-hand PR of
          every "a->b" pair.
        * label "Aware": DUP_PRS + [MST_PR] without any PR named in a pair.
        * any other label: DUP_PRS + [MST_PR] unchanged.

    Raises:
        ValueError: if NOTE is not "None" and contains no ":" or a PR number
            in a relevant pair is not an integer.
        IndexError: if a "Partial" pair has no "->".

    The previous implementation appended to and removed from the caller's
    list in place, so calling it twice on the same row gave different
    results; it also removed only the first occurrence of an excluded PR.
    """
    if NOTE == "None":
        return list(DUP_PRS)

    label, info = NOTE.split(":", 1)
    pairs = [pair.split("->") for pair in info.split(";")]

    if label == "Partial":
        # No matter how many PRs the group holds, when two of them overlap
        # only partially it is enough to drop the larger one, even if the
        # smaller or the larger one has further duplicates.
        #   O-P, P is larger: return only O.
        #   O-P-Q, Q partially duplicates both O and P, and O and P are
        #     duplicates: return O and P.
        #   O-P-Q, Q partially duplicates both O and P, but O and P are not
        #     duplicates: return O and P.
        #   O-P-Q-R, Q partially duplicates both O and P, O and P are not
        #     duplicates, and Q/R are additional duplicates: return O, P, R.
        excluded = set(int(pair[1]) for pair in pairs)
    elif label == "Aware":
        # Counting the master PR as well, drop every PR that takes part in
        # an "aware" relation.
        excluded = set(int(pr) for pair in pairs for pr in pair)
    else:
        excluded = set()

    candidates = list(DUP_PRS) + [MST_PR]
    return [pr for pr in candidates if pr not in excluded]
|
||
|
||
def large_pr():
    """Debug helper: print a PR URL for the duplicate group whose churn sums to 1032114.

    Reads "../major_revision/duplicate_ds.csv" (header line skipped; columns
    PRJ_ID, MST_PR, ";"-separated DUP_PRS, NOTE), filters every group through
    omit_problem_pair and adds up the `churn` of all file_history rows of the
    remaining PRs.

    NOTE(review): the source this was recovered from had lost its
    indentation. The final `if` is placed after the per-PR loop here (so it
    tests the group total and prints the last PR of the group); confirm
    against the original file that it was not inside the loop.
    """
    dups = []
    with open("../major_revision/duplicate_ds.csv","r") as fp:
        # [1:] skips the CSV header line.
        for line in fp.readlines()[1:]:
            PRJ_ID, MST_PR, DUP_PRS, NOTE = line.split(",")
            dups.append((int(PRJ_ID), int(MST_PR),
                [int(item) for item in DUP_PRS.split(";")], NOTE.strip()))

    for item in dups:
        PRJ_ID, MST_PR, DUP_PRS, NOTE = item
        # print PRJ_ID, MST_PR, DUP_PRS, NOTE
        tmp_files, tmp_lines = 0, 0
        dup_prs = omit_problem_pair(DUP_PRS, NOTE, MST_PR)
        if len(dup_prs) == 0:# skip the group if nothing was returned
            continue
        for dup_pr in dup_prs:
            # print "\t", dup_pr
            cursor.execute("select file_name, churn from file_history where prj_id=%s and pr_num=%s",
                (PRJ_ID, dup_pr))
            fs = cursor.fetchall()
            for df in fs:
                tmp_files += 1
                tmp_lines += df[1]  # df[1] is the `churn` column
        # Presumably 1032114 is the total of one specific outlier being
        # tracked down -- TODO confirm.
        if tmp_lines == 1032114:
            cursor.execute("select user_name, repo_name from project where id=%s",(PRJ_ID,))
            un, rn = cursor.fetchone()
            print "https://github.com/%s/%s/pull/%d"%(un, rn,dup_pr)
|
||
|
||
def getLongestSubstring(str1, str2):
    """Return the length of the longest common contiguous substring.

    Args:
        str1: first string.
        str2: second string.

    Returns:
        The length (int) of the longest run of characters that appears
        contiguously in both strings; 0 if either string is empty or they
        share no character.

    Implemented as a rolling-row dynamic programme
    (O(len(str1) * len(str2)) time, O(len(str2)) memory). It returns the
    same values as the former brute-force scan, which also carried unused
    position/counter variables.
    """
    if not str1 or not str2:
        return 0

    longest = 0
    width = len(str2) + 1
    # prev_row[j] is the length of the common suffix of the prefix of str1
    # processed so far and str2[:j].
    prev_row = [0] * width
    for ch1 in str1:
        cur_row = [0] * width
        for j, ch2 in enumerate(str2, 1):
            if ch1 == ch2:
                run = prev_row[j - 1] + 1
                cur_row[j] = run
                if run > longest:
                    longest = run
        prev_row = cur_row
    return longest
|
||
|
||
def mutl_cmt_authors(repo_id, pr_num):
    """Tell whether the stored commits of a PR carry more than one author name.

    Looks up the raw commit JSON in commit_raw_json (keyed by
    repo_maps[repo_id] and pr_num) and collects the lower-cased
    commit.author.name of every commit.

    Returns:
        True (after printing the set of names) when more than one distinct
        name is found; False otherwise, including when no JSON is stored
        (in which case "None commits" is printed).
    """
    cursor.execute(
        "select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",
        (repo_maps[repo_id], pr_num))
    row = cursor.fetchone()
    if row is None or row[0] is None:
        print("None commits")
        return False

    distinct_names = set(
        commit["commit"]["author"]["name"].lower()
        for commit in json.loads(row[0]))
    if len(distinct_names) <= 1:
        return False

    print(distinct_names)
    return True
|
||
|
||
|
||
import urllib2
|
||
import os

# GitHub REST endpoint for a single commit: (owner, repository, sha).
cmt_TEMPLATE = "https://api.github.com/repos/%s/%s/commits/%s"

# SECURITY: a personal access token was committed to source here. It should
# be revoked and removed; set the GITHUB_TOKEN environment variable to supply
# a token without editing this file. The literal is kept only as a fallback
# so existing runs behave as before.
_github_token = os.environ.get("GITHUB_TOKEN", "623d83b977ead9c6389e607b0e0ae0088e8e27a2")

# Headers sent with every GitHub API request issued by _get_url.
send_headers = {"Content-Type":"application/json","Authorization":"token " + _github_token}
|
||
|
||
|
||
def _get_url(url,retry_times=3):
    """Fetch `url` with the module's `send_headers`, retrying on failure.

    Args:
        url: address to request.
        retry_times: number of retries left. Before each retry the function
            sleeps 3 * (4 - retry_times) seconds (3s, 6s, 9s for the default).

    Returns:
        (raw_data, result): the UTF-8 decoded body and the response object on
        success. (None, result) once the retries are used up, where `result`
        is None if the request itself failed, or the response object if
        reading/decoding the body failed.
    """
    # Local import: this file never imports `time` at module level, so the
    # back-off below used to raise NameError on the first retry.
    import time

    req = urllib2.Request(url,headers = send_headers)
    result = None
    try:
        result = urllib2.urlopen(req,timeout=20)
        raw_data = result.read().decode('utf-8')
    except urllib2.HTTPError as e:
        failure, error_msg = e, e.code
    except urllib2.URLError as e:
        failure, error_msg = e, e.reason
    except Exception as e:
        failure, error_msg = e, getattr(e, "message", None)
    else:
        return raw_data,result

    # Every exception is a failure, even one whose code/reason is None
    # (previously that case fell through to an unbound `raw_data`).
    print("%s %s" % (error_msg, failure))
    if retry_times == 0:
        return None, result

    # max() keeps the delay non-negative when a caller passes retry_times > 4.
    time.sleep(max(0, 3*(4-retry_times)))
    return _get_url(url,retry_times-1)
|
||
|
||
def get_commit(sha,repo_id):
    """Download one commit from the GitHub API and store it in real_cmts.

    Does nothing (besides printing "already sha") when a row with this sha
    already exists. Otherwise the owner/repository of `repo_id` is read from
    the project table, the commit URL is fetched through _get_url, and the
    returned text is inserted as (sha, info) and committed.

    NOTE(review): the fetch is repeated until _get_url returns a body, so a
    permanently failing URL makes this loop forever.
    """
    cursor.execute("select sha from real_cmts where sha=%s",(sha,))
    if cursor.fetchone() is not None:
        print("already sha")
        return

    cursor.execute("select user_name, repo_name from project where id=%s",(repo_id,))
    owner, repo = cursor.fetchone()
    url = cmt_TEMPLATE%(owner,repo,sha)

    payload = None
    while payload is None:
        print(url)
        payload, _response = _get_url(url)
        if payload is None:
            print("返回none")

    cursor.execute("insert into real_cmts (sha,info) values(%s,%s)", (sha, payload))
    conn.commit()
|
||
|
||
|
||
def _same_person(cmt_author, pr_author, pr_author_name, len_th=3):
    """Heuristically decide whether a commit author name denotes the PR author.

    All names are expected lower-cased. A match is any of: `pr_author` is
    contained in `cmt_author`; `pr_author_name` (when not None) contains or is
    contained in `cmt_author`; or the longest common substring between
    `cmt_author` and either name is longer than `len_th` characters.
    """
    if pr_author in cmt_author:
        return True
    if getLongestSubstring(cmt_author, pr_author) > len_th:
        return True
    if pr_author_name is None:
        return False
    return (pr_author_name in cmt_author
            or cmt_author in pr_author_name
            or getLongestSubstring(cmt_author, pr_author_name) > len_th)


def pr_real_commit(repo_id, pr_num):
    """Return the SHAs of the PR's commits that appear to be by the PR author.

    The PR author is read from the `pull-request` table; a second name is
    read from the `user_name` column of the allrepo_users row whose `name`
    equals that author (None when no such row or value exists). Each commit
    stored in commit_raw_json (keyed by repo_maps[repo_id] and pr_num) is
    kept when its lower-cased commit.author.name matches either name
    according to _same_person.

    Args:
        repo_id: project id as used in the `pull-request` table.
        pr_num: pull request number.

    Returns:
        List of commit SHA strings, in stored order; [] when no commit JSON
        is stored for the PR.

    Changes from the previous version:
      * a missing commit row returns [] instead of raising TypeError (the
        old guard only executed `pass`);
      * the second longest-common-substring test now compares against
        `pr_author_name`; it used to repeat the `pr_author` comparison;
      * a missing allrepo_users row is treated as "no second name";
      * the unused project lookup and unused author sets were removed.
    """
    cursor.execute("select author from `pull-request` where prj_id=%s and pr_num=%s",(repo_id, pr_num))
    pr_author = cursor.fetchone()[0]

    # The lookup uses the author exactly as stored, before lower-casing.
    cursor.execute("select user_name from allrepo_users where name=%s",(pr_author,))
    user_row = cursor.fetchone()
    pr_author_name = user_row[0] if user_row is not None else None

    pr_author = pr_author.lower()
    if pr_author_name is not None:
        pr_author_name = pr_author_name.lower()

    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
    cmt_row = cursor.fetchone()
    if cmt_row is None or cmt_row[0] is None:
        return []

    cmts = []
    for prc in json.loads(cmt_row[0]):
        cmt_author = prc["commit"]["author"]["name"].lower()
        if _same_person(cmt_author, pr_author, pr_author_name):
            cmts.append(prc["sha"])
    return cmts
|
||
|
||
def crawl_mul_author():
    """Run pr_real_commit on every duplicate-dataset PR with several commit authors.

    Reads "../major_revision/duplicate_ds.csv" (header line skipped; columns
    PRJ_ID, MST_PR, ";"-separated DUP_PRS, NOTE), filters each group through
    omit_problem_pair, and for every remaining PR for which mutl_cmt_authors
    reports more than one distinct commit author name, calls pr_real_commit.
    The result of pr_real_commit is discarded.
    """
    with open("../major_revision/duplicate_ds.csv","r") as csv_file:
        rows = csv_file.readlines()[1:]

    dups = []
    for row in rows:
        prj_field, mst_field, dup_field, note_field = row.split(",")
        dups.append((
            int(prj_field),
            int(mst_field),
            [int(number) for number in dup_field.split(";")],
            note_field.strip(),
        ))

    for prj_id, mst_pr, dup_list, note in dups:
        # An empty result simply means there is nothing to iterate over.
        for dup_pr in omit_problem_pair(dup_list, note, mst_pr):
            if mutl_cmt_authors(prj_id, dup_pr):
                pr_real_commit(prj_id, dup_pr)
|
||
|
||
|
||
|
||
def redundancy():
    """Print summary statistics of changed files/lines per duplicate group.

    Reads "../major_revision/duplicate_ds.csv" (header line skipped; columns
    PRJ_ID, MST_PR, ";"-separated DUP_PRS, NOTE) and filters each group
    through omit_problem_pair. For every group with PRs left, sums
    `changed_files` and `additions + deletions` from pr_changes over those
    PRs. Groups whose file total is 0 are listed with their GitHub URLs.
    Finally prints pandas `describe()` of the per-group file and line totals.

    NOTE(review): a PR without a pr_changes row makes fetchone() return None
    and the unpacking below raise TypeError.
    """
    with open("../major_revision/duplicate_ds.csv","r") as csv_file:
        rows = csv_file.readlines()[1:]

    dups = []
    for row in rows:
        prj_field, mst_field, dup_field, note_field = row.split(",")
        dups.append((
            int(prj_field),
            int(mst_field),
            [int(number) for number in dup_field.split(";")],
            note_field.strip(),
        ))

    print("redundant contributor resource:")
    files = []
    lines = []
    for prj_id, mst_pr, dup_list, note in dups:
        kept_prs = omit_problem_pair(dup_list, note, mst_pr)
        if not kept_prs:
            # Nothing meaningful left in this group.
            continue

        group_files = 0
        group_lines = 0
        for pr_number in kept_prs:
            cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                (prj_id, pr_number))
            additions, deletions, changed_files = cursor.fetchone()
            group_files += changed_files
            group_lines += additions + deletions

        files.append(group_files)
        lines.append(group_lines)

        if group_files != 0:
            continue

        # Groups with no changed files at all are listed for manual inspection.
        print("%s %s" % (group_files, group_lines))
        cursor.execute("select user_name, repo_name from project where id=%s",(prj_id,))
        owner, repo = cursor.fetchone()
        for pr_number in [mst_pr] + list(kept_prs):
            print("https://github.com/%s/%s/pull/%d"%(owner, repo, pr_number))
        print("****"*20)

    print(pd.Series(files).describe())
    print(pd.Series(lines).describe())
|
||
# print lines
|
||
|
||
|
||
def get_all_real_commit():
    """Run pr_real_commit on every pull request that has several commit authors.

    Loads all rows of the `pull-request` table up front (the shared cursor is
    reused by the calls below), then for each PR for which mutl_cmt_authors
    returns True calls pr_real_commit. The returned commit list is discarded.
    """
    cursor.execute("select id, prj_id, pr_num,author from `pull-request`")
    all_prs = cursor.fetchall()
    for _pr_id, prj_id, pr_num, _pr_author in all_prs:
        if not mutl_cmt_authors(prj_id, pr_num):
            continue
        pr_real_commit(prj_id, pr_num)
|
||
|
||
def mutl_author_pr():
    """Print the GitHub URL of every duplicate-dataset PR with several commit authors.

    Reads "../major_revision/duplicate_ds.csv" (header line skipped; columns
    PRJ_ID, MST_PR, ";"-separated DUP_PRS, NOTE), filters each group through
    omit_problem_pair, and prints the pull-request URL of each remaining PR
    for which mutl_cmt_authors returns True.
    """
    with open("../major_revision/duplicate_ds.csv","r") as csv_file:
        rows = csv_file.readlines()[1:]

    dups = []
    for row in rows:
        prj_field, mst_field, dup_field, note_field = row.split(",")
        dups.append((
            int(prj_field),
            int(mst_field),
            [int(number) for number in dup_field.split(";")],
            note_field.strip(),
        ))

    for prj_id, mst_pr, dup_list, note in dups:
        # An empty result simply means there is nothing to iterate over.
        for dup_pr in omit_problem_pair(dup_list, note, mst_pr):
            if not mutl_cmt_authors(prj_id, dup_pr):
                continue
            cursor.execute("select user_name, repo_name from project where id=%s",(prj_id,))
            owner, repo = cursor.fetchone()
            print("https://github.com/%s/%s/pull/%d"%(owner, repo, dup_pr))
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point. Only redundancy() is enabled; the other analyses
    # are left as commented-out toggles to be switched on by hand.
    # large_pr()
    redundancy()
    # get_all_real_commit()

    # mutl_author_pr()