duppr_analysis/experiment_code/minor_revision/pr_change.py

# coding:utf-8
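# pr_change.py: extracts PR change sizes (additions, deletions, changed files) from the
# stored raw PR JSON, separates commits written by the PR author from commits mixed in
# by other authors, and recomputes the size metrics used for the RQ2/RQ3 tables and the
# duplicate-PR comparison.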
import dbop
dbop.init_pool(3)
import json
repo_maps = {}
with open("../prj_par.txt", "r") as fp:
    for line in fp.readlines():
        ps = line.split("\t")
        repo_maps[int(ps[0])] = int(ps[1])
import MySQLdb
with open("../_config.json") as fp:
    config = json.load(fp)
local_db_config = config["local_db"]
conn = MySQLdb.connect(host=local_db_config["db_host"], user=local_db_config["db_user"],
                       passwd=local_db_config["db_passwd"], db=local_db_config["db_name"],
                       port=3306, charset='utf8mb4')
cursor = conn.cursor()
def extract_pr_change():
    ch_sql = "insert into pr_changes (pr_id, repo_id, pr_num, additions, deletions, changed_files) values (%s,%s,%s,%s,%s,%s)"
    # fetch all pull requests
    prs = dbop.select_all("select id, prj_id, pr_num from `pull-request`")
    for pr in prs:
        pr_id, prj_id, pr_num = pr
        pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
                                  (repo_maps[prj_id], pr_num))
        if pr_json is None:
            print "no json", pr
            continue
        pr_json = json.loads(pr_json[0])
        dbop.execute(ch_sql, (pr_id, prj_id, pr_num, pr_json["additions"], pr_json["deletions"], pr_json["changed_files"]))
def update_extract_pr_change():
    # retry rows whose sizes are all zero, in case the raw JSON was fetched later
    prs = dbop.select_all("select pr_id, repo_id, pr_num from pr_changes where additions=0 and deletions=0 and changed_files=0")
    print len(prs)
    count = 0
    for pr in prs:
        print count
        pr_id, prj_id, pr_num = pr
        pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
                                  (repo_maps[prj_id], pr_num))
        if pr_json is None or pr_json[0] is None or len(pr_json[0].strip()) == 0:
            print "no json", pr
            continue
        print repo_maps[prj_id], pr_num
        try:
            pr_json = json.loads(pr_json[0])
            ch_sql = "update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s"
            dbop.execute(ch_sql, (pr_json["additions"], pr_json["deletions"], pr_json["changed_files"], pr_id))
        except Exception as e:
            print "error"
        count += 1
def getLongestSubstring(str1, str2):
    # length of the longest common substring of str1 and str2 (brute force)
    longest = 0
    start_pos1 = -1
    start_pos2 = -1
    compares = 0  # number of comparisons performed
    for i in range(len(str1)):
        for j in range(len(str2)):
            length = 0
            m = i
            n = j
            while str1[m] == str2[n]:
                compares += 1
                length += 1
                m += 1
                n += 1
                if m >= len(str1) or n >= len(str2):
                    break
            if longest < length:
                compares += 1
                longest = length
                start_pos1 = i
                start_pos2 = j
    return longest
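# Note: pr_real_commit below uses getLongestSubstring for fuzzy author matching,
# treating a commit author as the PR author when the two names share a common
# substring longer than len_th (3) characters.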
def mutl_cmt_authors(repo_id, pr_num):
    # True if the PR's commits carry more than one distinct author name
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s", (repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        print "None commits"
        return False
    pr_cmts = json.loads(pr_cmts[0])
    authors = set()
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        authors.add(cmt_author)
    return len(authors) > 1
def _parse_chunks(chunk_raw):
    # parse a unified-diff patch into (change_type, old-file line number) pairs
    chunks = []
    chunk_lns = chunk_raw.split("\n")
    chunk_start_at = -1
    for cln in chunk_lns:
        if cln.startswith("@@"):
            # hunk header, e.g. "@@ -10,3 +10,4 @@": take the old-file start line
            chunk_start_at = int(cln[3:].split(",")[0][1:])
        else:
            if cln.startswith("-") or cln.startswith("+"):
                chunks.append((cln[0], chunk_start_at))
                chunk_start_at += 1
    return chunks
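# Example: for a patch such as
#   @@ -10,3 +10,4 @@
#   -old line
#   +new line
# _parse_chunks returns [('-', 10), ('+', 11)], i.e. the change type and the line
# number (counted from the old-file start of the hunk) of each changed line.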
def _patch_merge(patches):
    # collect per-file deletion line positions across a list of patches ('+' lines are ignored)
    files, dels, adds = set(), {}, {}
    for patch in patches:
        for fl in patch:
            if "patch" not in fl:
                continue
            file_name, file_chunks = fl["filename"], fl["patch"]
            files.add(file_name)
            if file_name not in dels:
                dels[file_name] = []
            if file_name not in adds:
                adds[file_name] = []
            chunks = _parse_chunks(file_chunks)
            for chl in chunks:
                change_type = chl[0]
                change_line = chl[1]
                if change_type == "-":
                    # compare against the deletions recorded so far
                    already_dels = dels[file_name]
                    if change_line in already_dels:
                        pass
                    else:
                        already_dels.append(change_line)
                        for adl in range(0, len(already_dels)):
                            if already_dels[adl] < change_line:
                                dels[file_name][adl] -= 1
def _cmt_changes(cmts):
    # aggregate changed files, added lines and deleted lines over a list of commit shas
    files, adds, dels = set(), 0, 0
    for cmt in cmts:
        cursor.execute("select info from not_real_cmts where sha=%s", (cmt,))
        path = cursor.fetchone()
        if path is None:
            cursor.execute("select info from real_cmts where sha=%s", (cmt,))
            path = cursor.fetchone()
        if path is None:
            continue
        try:
            sha_json = json.loads(path[0])["files"]
        except Exception as e:
            print "error json", cmt
            return 0, 0, 0
        for pf in sha_json:
            if "filename" in pf:
                files.add(pf["filename"])
                adds += pf["additions"]
                dels += pf["deletions"]
    return len(files), adds, dels
def _get_real_change(repo_id, pr_num, old_changs):
    # re-compute a PR's change size using only the commits written by the PR author,
    # capped by the originally recorded totals
    # print "get real change", repo_id, pr_num
    old_adds, old_dels, old_files = old_changs
    files, adds, dels = set(), 0, 0
    cmts, not_cmts = pr_real_commit(repo_id, pr_num)
    print len(cmts), len(not_cmts)
    r_f, r_a, r_d = _cmt_changes(cmts)      # changes from the author's own commits
    n_f, n_a, n_d = _cmt_changes(not_cmts)  # changes from other authors' commits
    print r_f, r_a, r_d, n_f, n_a, n_d
    if r_f > old_files:
        r_f = old_files
    if len(cmts) <= 1:
        if r_a > old_adds:
            r_a = old_adds
        if r_d > old_dels:
            r_d = old_dels
        return r_f, r_a, r_d
    else:
        if n_a < old_adds:
            r_a = old_adds - n_a
        if n_d < old_dels:
            r_d = old_dels - n_d
        if r_a > old_adds:
            r_a = old_adds
        if r_d > old_dels:
            r_d = old_dels
        return r_f, r_a, r_d
def pr_real_commit(repo_id, pr_num):
    # split a PR's commits into those written by the PR author (cmts) and the rest (not_cmts)
    cursor.execute("select user_name, repo_name from project where id=%s", (repo_id,))
    un, rn = cursor.fetchone()
    cursor.execute("select author from `pull-request` where prj_id=%s and pr_num=%s", (repo_id, pr_num))
    pr_author = cursor.fetchone()[0]
    cursor.execute("select user_name from allrepo_users where name=%s", (pr_author,))
    pr_author_name = cursor.fetchone()
    if pr_author_name is not None:
        pr_author_name = pr_author_name[0]
    pr_author = pr_author.lower()
    if pr_author_name is not None:
        pr_author_name = pr_author_name.lower()
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s", (repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        # print "None commits"
        return [], []  # no recorded commits for this PR
    pr_cmts = json.loads(pr_cmts[0])
    # print "total commits:", len(pr_cmts)
    authors = set()
    real_authors = set()
    cmts = []
    not_cmts = []
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        authors.add(cmt_author)
        FLAG = False
        if cmt_author == pr_author:
            FLAG = True
        if pr_author_name is not None and pr_author_name == cmt_author:
            FLAG = True
        if cmt_author.find(pr_author) != -1:
            FLAG = True
        if pr_author_name is not None and cmt_author.find(pr_author_name) != -1:
            FLAG = True
        if pr_author_name is not None and pr_author_name.find(cmt_author) != -1:
            FLAG = True
        # longest common substring match between author names
        len_th = 3
        if (getLongestSubstring(cmt_author, pr_author) > len_th or
                (pr_author_name is not None and getLongestSubstring(cmt_author, pr_author_name) > len_th)):
            FLAG = True
        if FLAG:
            real_authors.add(cmt_author)
            cmts.append(prc["sha"])
        else:
            not_cmts.append(prc["sha"])
            # print prc["sha"], cmt_author, pr_author, pr_author_name
            # get_commit(prc["sha"], repo_id)
    # print "https://github.com/%s/%s/pull/%d" % (un, rn, pr_num), len(pr_cmts), len(authors)
    # print "\t pr_author:", pr_author, pr_author_name
    # print "\t cmt authors:", authors
    # print "\t cmt real-authors:", real_authors
    # print "*******"
    return cmts, not_cmts
def update_rq2():
    cursor.execute("select id, prj_id, pr_num, file_changed, loc from rq2_metrics")
    result = cursor.fetchall()
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        old_adds, old_dels, old_files = cursor.fetchone()
        # first check whether the PR's commits come from more than one author
        if mutl_cmt_authors(prj_id, pr_num):
            # keep only the changes made by the PR author
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            files = r_f
            locs = r_a + r_d
        else:
            files = old_files
            locs = old_adds + old_dels
        # write the recomputed size back
        cursor.execute("update rq2_metrics set file_changed=%s, loc=%s where id=%s",
                       (files, locs, pr_id))
        conn.commit()
        print file_changed, loc, files, locs
def update_rq3():
    cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
    result = cursor.fetchall()
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        old_adds, old_dels, old_files = cursor.fetchone()
        # first check whether the PR's commits come from more than one author
        if mutl_cmt_authors(prj_id, pr_num):
            # keep only the changes made by the PR author
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            files = r_f
            locs = r_a + r_d
        else:
            files = old_files
            locs = old_adds + old_dels
        # write the recomputed size back
        cursor.execute("update rq3_metrics set files=%s, churn=%s where id=%s",
                       (files, locs, pr_id))
        conn.commit()
        print file_changed, loc, files, locs
def pr_work(pr_id, prj_id, pr_num):
    # commits mixed in from other authors are not distinguished here
    # reviews
    pr_author = dbop.select_one("select author from `pull-request` where id=%s", (pr_id,))[0]
    result = dbop.select_all("select id,author from comment where pr_id=%s", (pr_id,))
    rw = len(result)
    authors = [item[1] for item in result if item[1] != pr_author]
    # files
    fl, lc = 0, 0
    # # what if PRs whose commits mix in multiple authors were excluded?
    # if mutl_cmt_authors(prj_id, pr_num):
    #     pass
    # else:
    #     result = dbop.select_one("select changed_files, additions, deletions from pr_changes where pr_id=%s", (pr_id,))
    #     if result is not None:
    #         fl += result[0]
    #         lc += result[1] + result[2]
    #     else:
    #         fl, lc = 0, 0
    # checks
    cursor.execute("select ci_status from pr_ci where repo_id=%s and pr_num=%s",
                   (prj_id, pr_num))
    checks = cursor.fetchall()
    cmt_cc, cnt_cc = 0, 0
    for check in checks:
        check = json.loads(check[0])
        if len(check["statuses"]) != 0:
            cmt_cc += 1
        cnt_cc += len(check["statuses"])
    return rw, fl, lc, cmt_cc, cnt_cc, authors
def work_compare():
    # statistics over all PRs
    prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
    reviews, files, locs, cmt_checks, cnt_checks = 0, 0, 0, 0, 0
    cnt = set()
    rw_authors = []
    print len(prs)
    for pr in prs:
        pr_id, prj_id, pr_num, pr_author = pr
        cnt.add(pr_author)
        rw, fl, lc, cmt_cc, cnt_cc, rwer = pr_work(pr_id, prj_id, pr_num)
        reviews += rw
        rw_authors.extend(rwer)
        # print pr_id, rw, reviews
        files += fl
        locs += lc
        cmt_checks += cmt_cc
        cnt_checks += cnt_cc
    print len(cnt), files, locs, reviews, cmt_checks, cnt_checks, len(set(rw_authors))
    output = (files, locs, reviews, cmt_checks, cnt_checks, len(cnt))
    # the same statistics restricted to duplicate PRs
    prs = dbop.select_all("select prj_id, pr_num from rq2_metrics where is_dup=1")
    reviews, files, locs, cmt_checks, cnt_checks = 0, 0, 0, 0, 0
    rw_authors = []
    print len(prs)
    cnt = set()
    for pr in prs:
        prj_id, pr_num = pr
        cursor.execute("select id,author from `pull-request` where prj_id=%s and pr_num=%s", (prj_id, pr_num))
        pr_id, pr_author = cursor.fetchone()
        cnt.add(pr_author)
        rw, fl, lc, cmt_cc, cnt_cc, rwer = pr_work(pr_id, prj_id, pr_num)
        reviews += rw
        rw_authors.extend(rwer)
        files += fl
        locs += lc
        cmt_checks += cmt_cc
        cnt_checks += cnt_cc
    print len(cnt), files, locs, reviews, cmt_checks, cnt_checks, len(set(rw_authors))
    print files * 1.0 / output[0], locs * 1.0 / output[1], reviews * 1.0 / output[2], cmt_checks * 1.0 / output[3], cnt_checks * 1.0 / output[4], len(cnt) * 1.0 / output[5]
def update_real_change():
    # for PRs whose commits come from multiple authors, keep only the changes made by the PR author
    count = 1
    cursor.execute("select prj_id, pr_num, author from `pull-request`")
    prs = cursor.fetchall()
    for pr in prs:
        prj_id, pr_num, pr_author = pr
        if mutl_cmt_authors(prj_id, pr_num):
            print count
            count += 1
            cursor.execute("select pr_id, additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                           (prj_id, pr_num))
            fs = cursor.fetchone()
            pr_id, old_adds, old_dels, old_files = fs
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            print old_files, old_adds, old_dels, r_f, r_a, r_d
            cursor.execute("select user_name, repo_name from project where id=%s", (prj_id,))
            un, rn = cursor.fetchone()
            print "https://github.com/%s/%s/pull/%d" % (un, rn, pr_num)
            cursor.execute("update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s",
                           (r_a, r_d, r_f, pr_id))
            conn.commit()
def commit_compare():
    # statistics over all PRs
    prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
    print len(prs)
    commits = 0
    for pr in prs:
        pr_id, prj_id, pr_num, pr_author = pr
import pandas as pd
def rq3_pr_size_stas():
    # descriptive statistics (mean, std, min, median, max) of PR size for the RQ3 sample,
    # printed as LaTeX table rows
    cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
    result = cursor.fetchall()
    files, locs = [], []
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        fs = cursor.fetchone()
        files.append(fs[2])
        locs.append(fs[0] + fs[1])
    ss = pd.Series(files)
    nums = [ss.mean(), ss.std(), ss.median()]
    for i in range(0, len(nums)):
        ssn = nums[i]
        if ssn > 100:
            nums[i] = "%.1e" % ssn
        else:
            nums[i] = "%.2f" % ssn
    s_min, s_max = ss.min(), ss.max()
    if s_max > 100:
        s_max = "%.1e" % s_max
    print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\" % ("files\_changed", nums[0], nums[1], s_min, nums[2], s_max)
    ss = pd.Series(locs)
    nums = [ss.mean(), ss.std(), ss.median()]
    for i in range(0, len(nums)):
        ssn = nums[i]
        if ssn > 100:
            nums[i] = "%.1e" % ssn
        else:
            nums[i] = "%.2f" % ssn
    s_min, s_max = ss.min(), ss.max()
    if s_max > 100:
        s_max = "%.1e" % s_max
    print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\" % ("churn", nums[0], nums[1], s_min, nums[2], s_max)
if __name__ == "__main__":
    # extract_pr_change()
    # update_extract_pr_change()
    # work_compare()
    # update_rq2()
    # update_real_change()
    # commit_compare()
    rq3_pr_size_stas()