duppr_analysis/experiment_code/minor_revision/larg_pr.py

325 lines
10 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#coding:utf-8
# Compute the redundancy of duplicate pull requests (minor-revision analysis).
# NOTE: Python 2 source (print statements, urllib2 below).
import pandas as pd
import MySQLdb
import json

# Load local MySQL credentials from the shared experiment config file.
with open("../_config.json") as fp:
    config = json.load(fp)
local_db_config = config["local_db"]
conn = MySQLdb.connect(host=local_db_config["db_host"],user=local_db_config["db_user"],
    passwd=local_db_config["db_passwd"],db=local_db_config["db_name"],port=3306,charset='utf8mb4')
cursor = conn.cursor()

# repo_maps: project id -> repo id, read from a tab-separated mapping file.
repo_maps = {}
with open("../prj_par.txt","r")as fp:
    for line in fp.readlines():
        ps = line.split("\t")
        repo_maps[int(ps[0])] = int(ps[1])
def omit_problem_pair(DUP_PRS, NOTE, MST_PR):
    """Filter a duplicate-PR group down to the PRs worth comparing.

    Parameters:
        DUP_PRS -- list of duplicate PR numbers (ints).
        NOTE    -- annotation string: "None", or "<label>:<pairs>" where
                   pairs look like "a->b;c->d".
        MST_PR  -- the master PR number of the group.

    Returns the list of PR numbers to keep:
      * NOTE == "None": the group as-is (master excluded, as in the raw data).
      * "Partial": master is added, then for every "a->b" pair the LARGER
        side b (the containing PR) is dropped.  Dropping only the big one
        is enough no matter how many PRs are partially contained.
      * "Aware": master is added, then EVERY PR mentioned in an aware pair
        (both sides) is dropped.
      * any other label: master is added and nothing is dropped.

    Unlike the original, this does not mutate the caller's list (except on
    the "None" fast path, which returns it unchanged), and splits NOTE with
    maxsplit=1 so a ":" inside the pair info cannot raise.
    """
    if NOTE == "None":
        return DUP_PRS
    label, info = NOTE.split(":", 1)
    kept = DUP_PRS + [MST_PR]
    to_drop = set()
    if label == "Partial":
        # Only the containing (larger) PR of each pair is removed.
        for pair in info.split(";"):
            to_drop.add(int(pair.split("->")[1]))
    if label == "Aware":
        # Both ends of every aware pair are removed (master included).
        for pair in info.split(";"):
            for pr in pair.split("->"):
                to_drop.add(int(pr))
    return [pr for pr in kept if pr not in to_drop]
def large_pr():
    # Locate the duplicate-PR group whose accumulated churn equals a specific
    # outlier value (1032114, presumably spotted in redundancy() output --
    # TODO confirm) and print its GitHub URL for manual inspection.
    dups = []
    with open("../major_revision/duplicate_ds.csv","r") as fp:
        for line in fp.readlines()[1:]:
            # CSV columns: project id, master PR, ";"-separated dup PRs, note.
            PRJ_ID, MST_PR, DUP_PRS, NOTE = line.split(",")
            dups.append((int(PRJ_ID), int(MST_PR),
                [int(item) for item in DUP_PRS.split(";")], NOTE.strip()))
    for item in dups:
        PRJ_ID, MST_PR, DUP_PRS, NOTE = item
        # print PRJ_ID, MST_PR, DUP_PRS, NOTE
        tmp_files, tmp_lines = 0, 0
        dup_prs = omit_problem_pair(DUP_PRS, NOTE, MST_PR)
        if len(dup_prs) == 0:# skip groups with nothing meaningful left
            continue
        # Accumulate file count and churned lines over all kept PRs.
        for dup_pr in dup_prs:
            # print "\t", dup_pr
            cursor.execute("select file_name, churn from file_history where prj_id=%s and pr_num=%s",
                (PRJ_ID, dup_pr))
            fs = cursor.fetchall()
            for df in fs:
                tmp_files += 1
                tmp_lines += df[1]
        # NOTE(review): indentation reconstructed -- check placed after the
        # accumulation loop, so dup_pr here is the group's last PR.
        if tmp_lines == 1032114:
            cursor.execute("select user_name, repo_name from project where id=%s",(PRJ_ID,))
            un, rn = cursor.fetchone()
            print "https://github.com/%s/%s/pull/%d"%(un, rn,dup_pr)
def getLongestSubstring(str1, str2):
    """Return the length of the longest common substring of str1 and str2.

    Dynamic-programming rewrite of the original brute force, which was
    O(len1*len2*k) and used a bitwise `|` where a logical `or` was meant
    in its bounds check.  This version is O(len1*len2) time and
    O(len2) extra space; results are identical (length only -- the
    original's start positions and compare counter were never returned).
    """
    longest = 0
    # prev[j+1] == length of the common suffix ending at str1[i-1]/str2[j].
    prev = [0] * (len(str2) + 1)
    for ch1 in str1:
        curr = [0] * (len(str2) + 1)
        for j, ch2 in enumerate(str2):
            if ch1 == ch2:
                curr[j + 1] = prev[j] + 1
                if curr[j + 1] > longest:
                    longest = curr[j + 1]
        prev = curr
    return longest
def mutl_cmt_authors(repo_id, pr_num):
    # Return True when the PR's commits carry more than one distinct author
    # name (case-insensitive); False when single-authored or when the raw
    # commit JSON is missing from the cache table.
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        print "None commits"
        return False
    pr_cmts = json.loads(pr_cmts[0])
    authors = set()
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        authors.add(cmt_author)
    # NOTE(review): indentation reconstructed -- check placed after the loop;
    # the boolean result is the same either way.
    if len(authors) >1:
        print authors
        return True
    return False
import urllib2
import os

# Template for the GitHub "get a single commit" REST endpoint.
cmt_TEMPLATE = "https://api.github.com/repos/%s/%s/commits/%s"
# SECURITY FIX: a personal access token was hard-coded (and therefore leaked)
# here.  The token is now read from the environment; export GITHUB_TOKEN
# before running anything that hits the GitHub API.
send_headers = {"Content-Type": "application/json",
                "Authorization": "token %s" % os.environ.get("GITHUB_TOKEN", "")}
def _get_url(url,retry_times=3):
req = urllib2.Request(url,headers = send_headers)
try:
error_msg, result = None,None
result = urllib2.urlopen(req,timeout=20)
raw_data = result.read().decode('utf-8')
except urllib2.HTTPError, e:
error_msg = e.code
except urllib2.URLError, e:
error_msg = e.reason
except Exception,e:
error_msg = e.message
if error_msg != None:
print error_msg,e
if retry_times == 0:
return None, result
else:
time.sleep(3*(4-retry_times))
return _get_url(url,retry_times-1)
return raw_data,result
def get_commit(sha,repo_id):
    # Fetch one commit's JSON from the GitHub API and cache it in the
    # real_cmts table; no-op when the sha is already cached.
    # WARNING: loops forever if the API keeps returning None for this sha.
    cursor.execute("select sha from real_cmts where sha=%s",(sha,))
    result = cursor.fetchone()
    if result is not None:
        print "already sha"
        return
    cursor.execute("select user_name, repo_name from project where id=%s",(repo_id,))
    un, rn = cursor.fetchone()
    while True:
        url = cmt_TEMPLATE%(un,rn,sha)
        print url
        ets, result = _get_url(url)
        if ets is None:
            print "返回none"
            continue
        cursor.execute("insert into real_cmts (sha,info) values(%s,%s)", (sha, ets))
        conn.commit()
        break
def pr_real_commit(repo_id, pr_num):
    """Return the shas of the PR's commits that appear to be authored by the
    PR submitter.

    A commit author matches when it equals, contains, is contained in, or
    shares a common substring longer than 3 characters with either the
    submitter's login (`pr_author`) or display name (`pr_author_name`),
    all compared lower-cased.

    Returns a list of commit shas; [] when the raw commit JSON is missing
    (the original fell through a no-op `pass` and crashed in
    json.loads(None)).

    Fixes vs. original: the last substring check compared against
    pr_author twice (copy-paste bug) -- it now uses pr_author_name;
    unused debug locals/queries removed.
    """
    cursor.execute("select author from `pull-request` where prj_id=%s and pr_num=%s", (repo_id, pr_num))
    pr_author = cursor.fetchone()[0]
    cursor.execute("select user_name from allrepo_users where name=%s", (pr_author,))
    pr_author_name = cursor.fetchone()[0]
    pr_author = pr_author.lower()
    if pr_author_name is not None:
        pr_author_name = pr_author_name.lower()
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s", (repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        # BUG FIX: bail out instead of crashing on json.loads(None).
        return []
    pr_cmts = json.loads(pr_cmts[0])
    len_th = 3  # minimum shared-substring length, hoisted out of the loop
    cmts = []
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        matched = (cmt_author == pr_author
                   or cmt_author.find(pr_author) != -1
                   or getLongestSubstring(cmt_author, pr_author) > len_th)
        if not matched and pr_author_name is not None:
            matched = (cmt_author == pr_author_name
                       or cmt_author.find(pr_author_name) != -1
                       or pr_author_name.find(cmt_author) != -1
                       # BUG FIX: original compared against pr_author here.
                       or getLongestSubstring(cmt_author, pr_author_name) > len_th)
        if matched:
            cmts.append(prc["sha"])
    return cmts
def crawl_mul_author():
    """For every kept duplicate PR whose commits have several distinct
    authors, resolve which commits belong to the PR submitter.

    Reads the duplicate dataset CSV (header skipped), filters each group
    through omit_problem_pair, then runs mutl_cmt_authors /
    pr_real_commit per PR.  Fix vs. original: unused accumulator locals
    (tmp_files, tmp_lines) removed.
    """
    dups = []
    with open("../major_revision/duplicate_ds.csv", "r") as fp:
        for line in fp.readlines()[1:]:
            # CSV columns: project id, master PR, ";"-separated dups, note.
            PRJ_ID, MST_PR, DUP_PRS, NOTE = line.split(",")
            dups.append((int(PRJ_ID), int(MST_PR),
                [int(item) for item in DUP_PRS.split(";")], NOTE.strip()))
    for PRJ_ID, MST_PR, DUP_PRS, NOTE in dups:
        dup_prs = omit_problem_pair(DUP_PRS, NOTE, MST_PR)
        if len(dup_prs) == 0:  # skip groups with nothing meaningful left
            continue
        for dup_pr in dup_prs:
            if mutl_cmt_authors(PRJ_ID, dup_pr):
                pr_real_commit(PRJ_ID, dup_pr)
def redundancy():
    # Aggregate per duplicate-PR group the redundant contributor effort
    # (changed files, added+deleted lines) and print pandas summary stats.
    dups = []
    with open("../major_revision/duplicate_ds.csv","r") as fp:
        for line in fp.readlines()[1:]:
            # CSV columns: project id, master PR, ";"-separated dups, note.
            PRJ_ID, MST_PR, DUP_PRS, NOTE = line.split(",")
            dups.append((int(PRJ_ID), int(MST_PR),
                [int(item) for item in DUP_PRS.split(";")], NOTE.strip()))
    print "redundant contributor resource:"
    files = []
    lines = []
    for item in dups:
        PRJ_ID, MST_PR, DUP_PRS, NOTE = item
        tmp_files, tmp_lines = 0, 0
        dup_prs = omit_problem_pair(DUP_PRS, NOTE, MST_PR)
        if len(dup_prs) == 0:# skip groups with nothing meaningful left
            continue
        for dup_pr in dup_prs:
            cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                (PRJ_ID, dup_pr))
            fs = cursor.fetchone()
            tmp_files += fs[2]
            tmp_lines += fs[0] + fs[1]
        files.append(tmp_files)
        lines.append(tmp_lines)
        # if tmp_lines > 30000:
        # Debug aid: dump URLs of groups that report zero changed files.
        if tmp_files == 0:
            print tmp_files, tmp_lines
            cursor.execute("select user_name, repo_name from project where id=%s",(PRJ_ID,))
            un, rn = cursor.fetchone()
            print "https://github.com/%s/%s/pull/%d"%(un, rn, MST_PR)
            for dup_pr in dup_prs:
                print "https://github.com/%s/%s/pull/%d"%(un, rn, dup_pr)
            print "****"*20
    print pd.Series(files).describe()
    print pd.Series(lines).describe()
def get_all_real_commit():
    """Resolve submitter-authored commits for every PR in the database.

    Fixes vs. original: the cursor.execute() return value was assigned to
    `prs` and immediately overwritten by fetchall(); the unused `cmts`,
    `pr_id` and `pr_author` locals are gone.
    """
    cursor.execute("select id, prj_id, pr_num,author from `pull-request`")
    prs = cursor.fetchall()
    for _pr_id, prj_id, pr_num, _pr_author in prs:
        # Only PRs with several distinct commit authors need resolution.
        if mutl_cmt_authors(prj_id, pr_num):
            pr_real_commit(prj_id, pr_num)
def mutl_author_pr():
    # Print the GitHub URL of every kept duplicate PR whose commits have
    # more than one distinct author.
    dups = []
    with open("../major_revision/duplicate_ds.csv","r") as fp:
        for line in fp.readlines()[1:]:
            # CSV columns: project id, master PR, ";"-separated dups, note.
            PRJ_ID, MST_PR, DUP_PRS, NOTE = line.split(",")
            dups.append((int(PRJ_ID), int(MST_PR),
                [int(item) for item in DUP_PRS.split(";")], NOTE.strip()))
    for item in dups:
        PRJ_ID, MST_PR, DUP_PRS, NOTE = item
        dup_prs = omit_problem_pair(DUP_PRS, NOTE, MST_PR)
        if len(dup_prs) == 0:# skip groups with nothing meaningful left
            continue
        for dup_pr in dup_prs:
            if mutl_cmt_authors(PRJ_ID, dup_pr):
                cursor.execute("select user_name, repo_name from project where id=%s",(PRJ_ID,))
                un, rn = cursor.fetchone()
                print "https://github.com/%s/%s/pull/%d"%(un, rn, dup_pr)
if __name__ == "__main__":
    # Entry point: only the redundancy statistics are currently enabled;
    # the other analyses are kept for manual one-off runs.
    # large_pr()
    redundancy()
    # get_all_real_commit()
    # mutl_author_pr()