505 lines
15 KiB
Python
505 lines
15 KiB
Python
# coding:utf-8
|
||
import dbop
|
||
dbop.init_pool(3)
|
||
import json
|
||
|
||
repo_maps = {}
|
||
with open("../prj_par.txt","r")as fp:
|
||
for line in fp.readlines():
|
||
ps = line.split("\t")
|
||
repo_maps[int(ps[0])] = int(ps[1])
|
||
|
||
import MySQLdb
|
||
import json
|
||
with open("../_config.json") as fp:
|
||
config = json.load(fp)
|
||
local_db_config = config["local_db"]
|
||
conn = MySQLdb.connect(host=local_db_config["db_host"],user=local_db_config["db_user"],
|
||
passwd=local_db_config["db_passwd"],db=local_db_config["db_name"],port=3306,charset='utf8mb4')
|
||
cursor = conn.cursor()
|
||
|
||
|
||
def extract_pr_change():
|
||
ch_sql = "insert into pr_changes (pr_id, repo_id, pr_num, additions, deletions, changed_files)values (%s,%s,%s,%s,%s,%s)"
|
||
# 取出所有pr
|
||
prs = dbop.select_all("select id, prj_id, pr_num from `pull-request`")
|
||
for pr in prs:
|
||
pr_id, prj_id, pr_num = pr
|
||
pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
|
||
(repo_maps[prj_id], pr_num))
|
||
if pr_json is None:
|
||
print "no json", pr
|
||
continue
|
||
pr_json = json.loads(pr_json[0])
|
||
|
||
dbop.execute(ch_sql, (pr_id, prj_id, pr_num, pr_json["additions"], pr_json["deletions"], pr_json["changed_files"]))
|
||
|
||
def update_extract_pr_change():
|
||
prs = dbop.select_all("select pr_id, repo_id, pr_num from pr_changes where additions=0 and deletions=0 and changed_files=0")
|
||
print len(prs)
|
||
count = 0
|
||
for pr in prs:
|
||
print count
|
||
pr_id, prj_id, pr_num = pr
|
||
pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
|
||
(repo_maps[prj_id], pr_num))
|
||
if pr_json is None or pr_json[0] is None or len(pr_json[0].strip()) == 0:
|
||
print "no json", pr
|
||
continue
|
||
print repo_maps[prj_id], pr_num
|
||
try:
|
||
pr_json = json.loads(pr_json[0])
|
||
ch_sql = "update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s "
|
||
dbop.execute(ch_sql, (pr_json["additions"], pr_json["deletions"], pr_json["changed_files"],pr_id))
|
||
except Exception,e:
|
||
print "error"
|
||
count += 1
|
||
|
||
def getLongestSubstring(str1,str2):
|
||
longest=0
|
||
start_pos1=-1
|
||
start_pos2=-1
|
||
compares=0 # 记录比较次数
|
||
|
||
for i in range(len(str1)):
|
||
for j in range(len(str2)):
|
||
length=0
|
||
m=i
|
||
n=j
|
||
while str1[m]==str2[n]:
|
||
compares+=1
|
||
length+=1
|
||
m+=1
|
||
n+=1
|
||
if (m>=len(str1))|(n>=len(str2)):
|
||
break
|
||
if longest<length:
|
||
compares+=1
|
||
longest=length
|
||
start_pos1=i
|
||
start_pos2=j
|
||
return longest
|
||
|
||
|
||
def mutl_cmt_authors(repo_id, pr_num):
|
||
cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
|
||
pr_cmts = cursor.fetchone()
|
||
if pr_cmts is None or pr_cmts[0] is None:
|
||
print "None commits"
|
||
return False
|
||
pr_cmts = json.loads(pr_cmts[0])
|
||
authors = set()
|
||
for prc in pr_cmts:
|
||
cmt_author = prc["commit"]["author"]["name"].lower()
|
||
authors.add(cmt_author)
|
||
if len(authors) >1:
|
||
return True
|
||
return False
|
||
|
||
|
||
def _parse_chunks(chunk_raw):
|
||
chunks = []
|
||
chunk_lns = chunk_raw.split("\n")
|
||
chunk_start_at = -1
|
||
for cln in chunk_lns:
|
||
if cln.startswith("@@"):
|
||
chunk_start_at = int(cln[3:].split(",")[0][1:])
|
||
else:
|
||
if cln.startswith("-") or cln.startswith("+"):
|
||
chunks.append((cln[0], chunk_start_at))
|
||
chunk_start_at += 1
|
||
return chunks
|
||
|
||
|
||
def _patch_merge(patches):
|
||
files, dels, adds = set(), {},{}
|
||
for patch in patches:
|
||
for fl in patch:
|
||
if "patch" not in fl:
|
||
continue
|
||
file_name, file_chunks = fl["filename"], fl["patch"]
|
||
files.add(file_name)
|
||
if file_name not in dels:
|
||
dels[file_name] = []
|
||
if file_name not in adds:
|
||
adds[file_name] = []
|
||
chunks = _parse_chunks(file_chunks)
|
||
|
||
for chl in chunks:
|
||
change_type = cnl[0]
|
||
change_line = cnl[1]
|
||
|
||
|
||
if change_type == "-":
|
||
|
||
# 先对比删除集合
|
||
already_dels = dels[file_name]
|
||
if change_line in already_dels:
|
||
pass
|
||
else:
|
||
already_dels.append(change_line)
|
||
for adl in range(0,len(already_dels)):
|
||
if already_dels[adl] < change_line:
|
||
dels[file_name][adl] -= 1
|
||
|
||
|
||
def _cmt_changes(cmts):
|
||
files, adds, dels = set(),0,0
|
||
for cmt in cmts:
|
||
cursor.execute("select info from not_real_cmts where sha=%s",(cmt,))
|
||
path = cursor.fetchone()
|
||
if path is None:
|
||
cursor.execute("select info from real_cmts where sha=%s",(cmt,))
|
||
path = cursor.fetchone()
|
||
if path is None:
|
||
continue
|
||
try:
|
||
sha_json = json.loads(path[0])["files"]
|
||
except Exception,e:
|
||
print "error json",cmt
|
||
return 0,0,0
|
||
for pf in sha_json:
|
||
if "filename" in pf:
|
||
files.add(pf["filename"])
|
||
adds += pf["additions"]
|
||
dels += pf["deletions"]
|
||
return len(files), adds, dels
|
||
def _get_real_change(repo_id, pr_num, old_changs):
|
||
# print "get real change", repo_id, pr_num
|
||
old_adds, old_dels, old_files = old_changs
|
||
files, adds, dels = set(),0,0
|
||
cmts, not_cmts = pr_real_commit(repo_id, pr_num)
|
||
print len(cmts), len(not_cmts)
|
||
|
||
r_f, r_a, r_d = _cmt_changes(cmts)
|
||
n_f, n_a, n_d = _cmt_changes(not_cmts)
|
||
print r_f, r_a, r_d, n_f, n_a, n_d
|
||
|
||
if r_f > old_files:
|
||
r_f = old_files
|
||
if len(cmts) <= 1:
|
||
if r_a > old_adds:
|
||
r_a = old_adds
|
||
if r_d > old_dels:
|
||
r_d = old_dels
|
||
return r_f, r_a, r_d
|
||
else:
|
||
if n_a < old_adds:
|
||
r_a = old_adds - n_a
|
||
if n_d < old_dels:
|
||
r_d = old_dels - n_d
|
||
|
||
if r_a > old_adds:
|
||
r_a = old_adds
|
||
if r_d > old_dels:
|
||
r_d = old_dels
|
||
|
||
return r_f, r_a, r_d
|
||
|
||
def pr_real_commit(repo_id, pr_num):
|
||
cursor.execute("select user_name, repo_name from project where id=%s",(repo_id,))
|
||
un, rn = cursor.fetchone()
|
||
cursor.execute("select author from `pull-request` where prj_id=%s and pr_num=%s",(repo_id, pr_num))
|
||
pr_author = cursor.fetchone()[0]
|
||
cursor.execute("select user_name from allrepo_users where name=%s",(pr_author,))
|
||
pr_author_name = cursor.fetchone()
|
||
if pr_author_name is not None:
|
||
pr_author_name = pr_author_name[0]
|
||
|
||
pr_author = pr_author.lower()
|
||
if pr_author_name is not None:
|
||
pr_author_name = pr_author_name.lower()
|
||
|
||
cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
|
||
pr_cmts = cursor.fetchone()
|
||
if pr_cmts is None or pr_cmts[0] is None:
|
||
pass
|
||
# print "None commits"
|
||
pr_cmts = json.loads(pr_cmts[0])
|
||
# print "total commits:", len(pr_cmts)
|
||
authors = set()
|
||
real_authors = set()
|
||
cmts = []
|
||
not_cmts = []
|
||
for prc in pr_cmts:
|
||
cmt_author = prc["commit"]["author"]["name"].lower()
|
||
authors.add(cmt_author)
|
||
FLAG = False
|
||
if cmt_author == pr_author:
|
||
FLAG = True
|
||
if pr_author_name is not None and pr_author_name == cmt_author:
|
||
FLAG = True
|
||
if cmt_author.find(pr_author)!= -1:
|
||
FLAG = True
|
||
if pr_author_name is not None and cmt_author.find(pr_author_name)!= -1:
|
||
FLAG = True
|
||
if pr_author_name is not None and pr_author_name.find(cmt_author)!= -1:
|
||
FLAG = True
|
||
|
||
# 最大字符串匹配
|
||
len_th = 3
|
||
if getLongestSubstring(cmt_author, pr_author) > len_th or (pr_author_name is not None and getLongestSubstring(cmt_author,pr_author) > len_th):
|
||
FLAG = True
|
||
|
||
if FLAG:
|
||
real_authors.add(cmt_author)
|
||
cmts.append(prc["sha"])
|
||
else:
|
||
not_cmts.append(prc["sha"])
|
||
# print prc["sha"],cmt_author, pr_author, pr_author_name
|
||
# get_commit(prc["sha"], repo_id)
|
||
|
||
# print "https://github.com/%s/%s/pull/%d"%(un, rn, pr_num), len(pr_cmts), len(authors)
|
||
# print "\t pr_author:", pr_author, pr_author_name
|
||
# print "\t cmt authors:", authors
|
||
# print "\t cmt real-authors:", real_authors
|
||
|
||
# print "*******"
|
||
return cmts,not_cmts
|
||
|
||
def update_rq2():
|
||
cursor.execute("select id, prj_id, pr_num, file_changed, loc from rq2_metrics")
|
||
result = cursor.fetchall()
|
||
|
||
for item in result:
|
||
pr_id, prj_id, pr_num, file_changed, loc = item
|
||
# 先判断是不是多个author
|
||
if mutl_cmt_authors(prj_id, pr_num):
|
||
files, locs = _get_real_change(prj_id, pr_num)
|
||
else:
|
||
cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
|
||
(PRJ_ID, dup_pr))
|
||
fs = cursor.fetchone()
|
||
files = fs[2]
|
||
locs = fs[0] + fs[1]
|
||
|
||
# 更新
|
||
cursor.execute("update rq2_metrics set file_changes=%, loc=%s where id=%s",
|
||
(files, locs, pr_id))
|
||
conn.commit()
|
||
print file_changed, loc, files, locs
|
||
|
||
def update_rq3():
|
||
cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
|
||
result = cursor.fetchall()
|
||
|
||
for item in result:
|
||
pr_id, prj_id, pr_num, file_changed, loc = item
|
||
# 先判断是不是多个author
|
||
if mutl_cmt_authors(prj_id, pr_num):
|
||
files, locs = _get_real_change(prj_id, pr_num)
|
||
else:
|
||
cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
|
||
(PRJ_ID, dup_pr))
|
||
fs = cursor.fetchone()
|
||
files = fs[2]
|
||
locs = fs[0] + fs[1]
|
||
|
||
# 更新
|
||
cursor.execute("update rq3_metrics set files=%, churn=%s where id=%s",
|
||
(files, locs, pr_id))
|
||
conn.commit()
|
||
print file_changed, loc, files, locs
|
||
def mutl_cmt_authors(repo_id, pr_num):
|
||
cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s",(repo_maps[repo_id], pr_num))
|
||
pr_cmts = cursor.fetchone()
|
||
if pr_cmts is None or pr_cmts[0] is None:
|
||
print "None commits"
|
||
return False
|
||
pr_cmts = json.loads(pr_cmts[0])
|
||
authors = set()
|
||
for prc in pr_cmts:
|
||
cmt_author = prc["commit"]["author"]["name"].lower()
|
||
authors.add(cmt_author)
|
||
if len(authors) >1:
|
||
return True
|
||
return False
|
||
|
||
def pr_work(pr_id, prj_id, pr_num):
|
||
# 是否混入了别人的commit,这个时候先不管了, 都不区分了
|
||
|
||
# reviews
|
||
pr_author = dbop.select_one("select author from `pull-request` where id=%s",(pr_id,))[0]
|
||
|
||
result = dbop.select_all("select id,author from comment where pr_id=%s",(pr_id,))
|
||
rw = len(result)
|
||
authors = [item[1] for item in result if item[1]!=pr_author]
|
||
|
||
# files
|
||
fl, lc = 0, 0
|
||
# # 如果去掉混入多个author的呢?
|
||
# if mutl_cmt_authors(prj_id, pr_num):
|
||
# pass
|
||
# else:
|
||
# result = dbop.select_one("select changed_files, additions, deletions from pr_changes where pr_id=%s",(pr_id,))
|
||
# if result is not None:
|
||
# fl += result[0]
|
||
# lc += result[1] + result[2]
|
||
# else:
|
||
# fl, lc = 0,0
|
||
|
||
# check
|
||
cursor.execute("select ci_status from pr_ci where repo_id=%s and pr_num=%s",
|
||
(prj_id, pr_num))
|
||
checks = cursor.fetchall()
|
||
cmt_cc, cnt_cc = 0,0
|
||
for check in checks:
|
||
check = json.loads(check[0])
|
||
if len(check["statuses"]) != 0:
|
||
cmt_cc += 1
|
||
cnt_cc += len(check["statuses"])
|
||
return rw, fl, lc, cmt_cc, cnt_cc ,authors
|
||
|
||
def work_compare():
|
||
# 所有PR的统计信息
|
||
prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
|
||
reviews, files, locs, cmt_checks, cnt_checks = 0,0,0,0,0
|
||
cnt = set()
|
||
rw_authors = []
|
||
print len(prs)
|
||
for pr in prs:
|
||
pr_id, prj_id, pr_num, pr_author = pr
|
||
cnt.add(pr_author)
|
||
rw, fl, lc, cmt_cc, cnt_cc,rwer = pr_work(pr_id, prj_id, pr_num)
|
||
reviews += rw
|
||
rw_authors.extend(rwer)
|
||
# print pr_id, rw, reviews
|
||
files += fl
|
||
locs += lc
|
||
cmt_checks += cmt_cc
|
||
cnt_checks += cnt_cc
|
||
print len(cnt), files, locs, reviews, cmt_checks, cnt_checks, len(set(rw_authors))
|
||
output = (files, locs, reviews, cmt_checks, cnt_checks,len(cnt))
|
||
|
||
prs = dbop.select_all("select prj_id, pr_num from rq2_metrics where is_dup=1")
|
||
reviews, files, locs, cmt_checks, cnt_checks = 0,0,0,0,0
|
||
rw_authors = []
|
||
print len(prs)
|
||
cnt = set()
|
||
for pr in prs:
|
||
prj_id, pr_num = pr
|
||
cursor.execute("select id,author from `pull-request` where prj_id=%s and pr_num=%s", (prj_id, pr_num))
|
||
pr_id,pr_author = cursor.fetchone()
|
||
cnt.add(pr_author)
|
||
rw, fl, lc, cmt_cc, cnt_cc,rwer = pr_work(pr_id, prj_id, pr_num)
|
||
reviews += rw
|
||
rw_authors.extend(rwer)
|
||
files += fl
|
||
locs += lc
|
||
cmt_checks += cmt_cc
|
||
cnt_checks += cnt_cc
|
||
print len(cnt),files, locs,reviews, cmt_checks, cnt_checks,len(set(rw_authors))
|
||
print files*1.0/output[0], locs*1.0/output[1], reviews*1.0/output[2], cmt_checks*1.0/output[3],cnt_checks*1.0/output[4],len(cnt)*1.0/output[5]
|
||
|
||
|
||
def update_real_change():
|
||
# 有多个author的commit的,只选出那些自己author的
|
||
|
||
count = 1
|
||
cursor.execute("select prj_id, pr_num, author from `pull-request`")
|
||
prs = cursor.fetchall()
|
||
for pr in prs:
|
||
prj_id, pr_num, pr_author = pr
|
||
|
||
if mutl_cmt_authors(prj_id, pr_num):
|
||
print count
|
||
count += 1
|
||
cursor.execute("select pr_id, additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
|
||
(prj_id, pr_num))
|
||
fs = cursor.fetchone()
|
||
pr_id, old_adds, old_dels, old_files = fs
|
||
|
||
r_f, r_a, r_d = _get_real_change(prj_id, pr_num,(old_adds, old_dels, old_files))
|
||
|
||
print old_files, old_adds, old_dels, r_f, r_a, r_d
|
||
cursor.execute("select user_name, repo_name from project where id=%s",(prj_id,))
|
||
un, rn = cursor.fetchone()
|
||
print "https://github.com/%s/%s/pull/%d"%(un, rn,pr_num)
|
||
cursor.execute("update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s ",
|
||
(r_a, r_d, r_f, pr_id))
|
||
conn.commit()
|
||
|
||
|
||
def commit_compare():
|
||
# 所有PR的统计信息
|
||
prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
|
||
print len(prs)
|
||
commits = 0
|
||
for pr in prs:
|
||
pr_id, prj_id, pr_num, pr_author = pr
|
||
|
||
|
||
|
||
prs = dbop.select_all("select prj_id, pr_num from rq2_metrics where is_dup=1")
|
||
reviews, files, locs, cmt_checks, cnt_checks = 0,0,0,0,0
|
||
rw_authors = []
|
||
print len(prs)
|
||
cnt = set()
|
||
for pr in prs:
|
||
prj_id, pr_num = pr
|
||
cursor.execute("select id,author from `pull-request` where prj_id=%s and pr_num=%s", (prj_id, pr_num))
|
||
pr_id,pr_author = cursor.fetchone()
|
||
cnt.add(pr_author)
|
||
rw, fl, lc, cmt_cc, cnt_cc,rwer = pr_work(pr_id, prj_id, pr_num)
|
||
reviews += rw
|
||
rw_authors.extend(rwer)
|
||
files += fl
|
||
locs += lc
|
||
cmt_checks += cmt_cc
|
||
cnt_checks += cnt_cc
|
||
print len(cnt),files, locs,reviews, cmt_checks, cnt_checks,len(set(rw_authors))
|
||
print files*1.0/output[0], locs*1.0/output[1], reviews*1.0/output[2], cmt_checks*1.0/output[3],cnt_checks*1.0/output[4],len(cnt)*1.0/output[5]
|
||
|
||
import pandas as pd
|
||
def rq3_pr_size_stas():
|
||
|
||
cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
|
||
result = cursor.fetchall()
|
||
|
||
files, locs = [],[]
|
||
for item in result:
|
||
pr_id, prj_id, pr_num, file_changed, loc = item
|
||
|
||
cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
|
||
(prj_id, pr_num))
|
||
fs = cursor.fetchone()
|
||
files.append(fs[2])
|
||
locs.append(fs[0] + fs[1])
|
||
|
||
ss = pd.Series(files)
|
||
nums = [ss.mean(),ss.std(),ss.median()]
|
||
for i in range(0,len(nums)):
|
||
ssn = nums[i]
|
||
if ssn > 100:
|
||
nums[i] = "%.1e"%ssn
|
||
else:
|
||
nums[i] = "%.2f"%ssn
|
||
s_min, s_max = ss.min(),ss.max()
|
||
if s_max > 100:
|
||
s_max = "%.1e"%s_max
|
||
print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\"%("files\_changed",nums[0],nums[1],ss.min(),nums[2],s_max)
|
||
|
||
ss = pd.Series(locs)
|
||
nums = [ss.mean(),ss.std(),ss.median()]
|
||
for i in range(0,len(nums)):
|
||
ssn = nums[i]
|
||
if ssn > 100:
|
||
nums[i] = "%.1e"%ssn
|
||
else:
|
||
nums[i] = "%.2f"%ssn
|
||
s_min, s_max = ss.min(),ss.max()
|
||
if s_max > 100:
|
||
s_max = "%.1e"%s_max
|
||
print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\"%("files\_changed",nums[0],nums[1],ss.min(),nums[2],s_max)
|
||
|
||
|
||
|
||
if __name__ == "__main__":
|
||
# extract_pr_change()
|
||
# update_extract_pr_change()
|
||
# work_compare()
|
||
# update_rq2()
|
||
# update_real_change()
|
||
# commit_compare()
|
||
rq3_pr_size_stas()
|