duppr_analysis/experiment_code/minor_revision/pr_change.py

# coding:utf-8
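# pr_change.py: extracts PR change sizes (additions, deletions, changed files) from the
# stored raw PR JSON, separates commits written by the PR author from commits mixed in
# by other authors, and recomputes the size metrics used for the RQ2/RQ3 tables and the
# duplicate-PR comparison.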
import dbop
dbop.init_pool(3)
import json
repo_maps = {}
with open("../prj_par.txt", "r") as fp:
    for line in fp.readlines():
        ps = line.split("\t")
        repo_maps[int(ps[0])] = int(ps[1])
import MySQLdb
with open("../_config.json") as fp:
    config = json.load(fp)
local_db_config = config["local_db"]
conn = MySQLdb.connect(host=local_db_config["db_host"], user=local_db_config["db_user"],
                       passwd=local_db_config["db_passwd"], db=local_db_config["db_name"],
                       port=3306, charset='utf8mb4')
cursor = conn.cursor()
def extract_pr_change():
    ch_sql = "insert into pr_changes (pr_id, repo_id, pr_num, additions, deletions, changed_files) values (%s,%s,%s,%s,%s,%s)"
    # fetch all pull requests
    prs = dbop.select_all("select id, prj_id, pr_num from `pull-request`")
    for pr in prs:
        pr_id, prj_id, pr_num = pr
        pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
                                  (repo_maps[prj_id], pr_num))
        if pr_json is None:
            print "no json", pr
            continue
        pr_json = json.loads(pr_json[0])
        dbop.execute(ch_sql, (pr_id, prj_id, pr_num, pr_json["additions"], pr_json["deletions"], pr_json["changed_files"]))
def update_extract_pr_change():
    # retry rows whose sizes are all zero, in case the raw JSON was fetched later
    prs = dbop.select_all("select pr_id, repo_id, pr_num from pr_changes where additions=0 and deletions=0 and changed_files=0")
    print len(prs)
    count = 0
    for pr in prs:
        print count
        pr_id, prj_id, pr_num = pr
        pr_json = dbop.select_one("select raw_json from pr_raw_json where repo_id=%s and pr_num=%s",
                                  (repo_maps[prj_id], pr_num))
        if pr_json is None or pr_json[0] is None or len(pr_json[0].strip()) == 0:
            print "no json", pr
            continue
        print repo_maps[prj_id], pr_num
        try:
            pr_json = json.loads(pr_json[0])
            ch_sql = "update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s"
            dbop.execute(ch_sql, (pr_json["additions"], pr_json["deletions"], pr_json["changed_files"], pr_id))
        except Exception as e:
            print "error"
        count += 1
def getLongestSubstring(str1, str2):
    # length of the longest common substring of str1 and str2 (brute force)
    longest = 0
    start_pos1 = -1
    start_pos2 = -1
    compares = 0  # number of comparisons performed
    for i in range(len(str1)):
        for j in range(len(str2)):
            length = 0
            m = i
            n = j
            while str1[m] == str2[n]:
                compares += 1
                length += 1
                m += 1
                n += 1
                if m >= len(str1) or n >= len(str2):
                    break
            if longest < length:
                compares += 1
                longest = length
                start_pos1 = i
                start_pos2 = j
    return longest
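# Note: pr_real_commit below uses getLongestSubstring for fuzzy author matching,
# treating a commit author as the PR author when the two names share a common
# substring longer than len_th (3) characters.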
def mutl_cmt_authors(repo_id, pr_num):
    # True if the PR's commits carry more than one distinct author name
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s", (repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        print "None commits"
        return False
    pr_cmts = json.loads(pr_cmts[0])
    authors = set()
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        authors.add(cmt_author)
    return len(authors) > 1
def _parse_chunks(chunk_raw):
    # parse a unified-diff patch into (change_type, old-file line number) pairs
    chunks = []
    chunk_lns = chunk_raw.split("\n")
    chunk_start_at = -1
    for cln in chunk_lns:
        if cln.startswith("@@"):
            # hunk header, e.g. "@@ -10,3 +10,4 @@": take the old-file start line
            chunk_start_at = int(cln[3:].split(",")[0][1:])
        else:
            if cln.startswith("-") or cln.startswith("+"):
                chunks.append((cln[0], chunk_start_at))
                chunk_start_at += 1
    return chunks
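# Example: for a patch such as
#   @@ -10,3 +10,4 @@
#   -old line
#   +new line
# _parse_chunks returns [('-', 10), ('+', 11)], i.e. the change type and the line
# number (counted from the old-file start of the hunk) of each changed line.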
def _patch_merge(patches):
    # collect per-file deletion line positions across a list of patches ('+' lines are ignored)
    files, dels, adds = set(), {}, {}
    for patch in patches:
        for fl in patch:
            if "patch" not in fl:
                continue
            file_name, file_chunks = fl["filename"], fl["patch"]
            files.add(file_name)
            if file_name not in dels:
                dels[file_name] = []
            if file_name not in adds:
                adds[file_name] = []
            chunks = _parse_chunks(file_chunks)
            for chl in chunks:
                change_type = chl[0]
                change_line = chl[1]
                if change_type == "-":
                    # compare against the deletions recorded so far
                    already_dels = dels[file_name]
                    if change_line in already_dels:
                        pass
                    else:
                        already_dels.append(change_line)
                        for adl in range(0, len(already_dels)):
                            if already_dels[adl] < change_line:
                                dels[file_name][adl] -= 1
def _cmt_changes(cmts):
    # aggregate changed files, added lines and deleted lines over a list of commit shas
    files, adds, dels = set(), 0, 0
    for cmt in cmts:
        cursor.execute("select info from not_real_cmts where sha=%s", (cmt,))
        path = cursor.fetchone()
        if path is None:
            cursor.execute("select info from real_cmts where sha=%s", (cmt,))
            path = cursor.fetchone()
        if path is None:
            continue
        try:
            sha_json = json.loads(path[0])["files"]
        except Exception as e:
            print "error json", cmt
            return 0, 0, 0
        for pf in sha_json:
            if "filename" in pf:
                files.add(pf["filename"])
                adds += pf["additions"]
                dels += pf["deletions"]
    return len(files), adds, dels
def _get_real_change(repo_id, pr_num, old_changs):
    # re-compute a PR's change size using only the commits written by the PR author,
    # capped by the originally recorded totals
    # print "get real change", repo_id, pr_num
    old_adds, old_dels, old_files = old_changs
    files, adds, dels = set(), 0, 0
    cmts, not_cmts = pr_real_commit(repo_id, pr_num)
    print len(cmts), len(not_cmts)
    r_f, r_a, r_d = _cmt_changes(cmts)      # changes from the author's own commits
    n_f, n_a, n_d = _cmt_changes(not_cmts)  # changes from other authors' commits
    print r_f, r_a, r_d, n_f, n_a, n_d
    if r_f > old_files:
        r_f = old_files
    if len(cmts) <= 1:
        if r_a > old_adds:
            r_a = old_adds
        if r_d > old_dels:
            r_d = old_dels
        return r_f, r_a, r_d
    else:
        if n_a < old_adds:
            r_a = old_adds - n_a
        if n_d < old_dels:
            r_d = old_dels - n_d
        if r_a > old_adds:
            r_a = old_adds
        if r_d > old_dels:
            r_d = old_dels
        return r_f, r_a, r_d
def pr_real_commit(repo_id, pr_num):
    # split a PR's commits into those written by the PR author (cmts) and the rest (not_cmts)
    cursor.execute("select user_name, repo_name from project where id=%s", (repo_id,))
    un, rn = cursor.fetchone()
    cursor.execute("select author from `pull-request` where prj_id=%s and pr_num=%s", (repo_id, pr_num))
    pr_author = cursor.fetchone()[0]
    cursor.execute("select user_name from allrepo_users where name=%s", (pr_author,))
    pr_author_name = cursor.fetchone()
    if pr_author_name is not None:
        pr_author_name = pr_author_name[0]
    pr_author = pr_author.lower()
    if pr_author_name is not None:
        pr_author_name = pr_author_name.lower()
    cursor.execute("select raw_json from commit_raw_json where repo_id=%s and pr_num=%s", (repo_maps[repo_id], pr_num))
    pr_cmts = cursor.fetchone()
    if pr_cmts is None or pr_cmts[0] is None:
        # print "None commits"
        return [], []  # no recorded commits for this PR
    pr_cmts = json.loads(pr_cmts[0])
    # print "total commits:", len(pr_cmts)
    authors = set()
    real_authors = set()
    cmts = []
    not_cmts = []
    for prc in pr_cmts:
        cmt_author = prc["commit"]["author"]["name"].lower()
        authors.add(cmt_author)
        FLAG = False
        if cmt_author == pr_author:
            FLAG = True
        if pr_author_name is not None and pr_author_name == cmt_author:
            FLAG = True
        if cmt_author.find(pr_author) != -1:
            FLAG = True
        if pr_author_name is not None and cmt_author.find(pr_author_name) != -1:
            FLAG = True
        if pr_author_name is not None and pr_author_name.find(cmt_author) != -1:
            FLAG = True
        # longest common substring match between author names
        len_th = 3
        if (getLongestSubstring(cmt_author, pr_author) > len_th or
                (pr_author_name is not None and getLongestSubstring(cmt_author, pr_author_name) > len_th)):
            FLAG = True
        if FLAG:
            real_authors.add(cmt_author)
            cmts.append(prc["sha"])
        else:
            not_cmts.append(prc["sha"])
            # print prc["sha"], cmt_author, pr_author, pr_author_name
            # get_commit(prc["sha"], repo_id)
    # print "https://github.com/%s/%s/pull/%d" % (un, rn, pr_num), len(pr_cmts), len(authors)
    # print "\t pr_author:", pr_author, pr_author_name
    # print "\t cmt authors:", authors
    # print "\t cmt real-authors:", real_authors
    # print "*******"
    return cmts, not_cmts
def update_rq2():
    cursor.execute("select id, prj_id, pr_num, file_changed, loc from rq2_metrics")
    result = cursor.fetchall()
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        old_adds, old_dels, old_files = cursor.fetchone()
        # first check whether the PR's commits come from more than one author
        if mutl_cmt_authors(prj_id, pr_num):
            # keep only the changes made by the PR author
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            files = r_f
            locs = r_a + r_d
        else:
            files = old_files
            locs = old_adds + old_dels
        # write the recomputed size back
        cursor.execute("update rq2_metrics set file_changed=%s, loc=%s where id=%s",
                       (files, locs, pr_id))
        conn.commit()
        print file_changed, loc, files, locs
def update_rq3():
    cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
    result = cursor.fetchall()
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        old_adds, old_dels, old_files = cursor.fetchone()
        # first check whether the PR's commits come from more than one author
        if mutl_cmt_authors(prj_id, pr_num):
            # keep only the changes made by the PR author
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            files = r_f
            locs = r_a + r_d
        else:
            files = old_files
            locs = old_adds + old_dels
        # write the recomputed size back
        cursor.execute("update rq3_metrics set files=%s, churn=%s where id=%s",
                       (files, locs, pr_id))
        conn.commit()
        print file_changed, loc, files, locs
def pr_work(pr_id, prj_id, pr_num):
    # commits mixed in from other authors are not distinguished here
    # reviews
    pr_author = dbop.select_one("select author from `pull-request` where id=%s", (pr_id,))[0]
    result = dbop.select_all("select id,author from comment where pr_id=%s", (pr_id,))
    rw = len(result)
    authors = [item[1] for item in result if item[1] != pr_author]
    # files
    fl, lc = 0, 0
    # # what if PRs whose commits mix in multiple authors were excluded?
    # if mutl_cmt_authors(prj_id, pr_num):
    #     pass
    # else:
    #     result = dbop.select_one("select changed_files, additions, deletions from pr_changes where pr_id=%s", (pr_id,))
    #     if result is not None:
    #         fl += result[0]
    #         lc += result[1] + result[2]
    #     else:
    #         fl, lc = 0, 0
    # checks
    cursor.execute("select ci_status from pr_ci where repo_id=%s and pr_num=%s",
                   (prj_id, pr_num))
    checks = cursor.fetchall()
    cmt_cc, cnt_cc = 0, 0
    for check in checks:
        check = json.loads(check[0])
        if len(check["statuses"]) != 0:
            cmt_cc += 1
        cnt_cc += len(check["statuses"])
    return rw, fl, lc, cmt_cc, cnt_cc, authors
def work_compare():
    # statistics over all PRs
    prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
    reviews, files, locs, cmt_checks, cnt_checks = 0, 0, 0, 0, 0
    cnt = set()
    rw_authors = []
    print len(prs)
    for pr in prs:
        pr_id, prj_id, pr_num, pr_author = pr
        cnt.add(pr_author)
        rw, fl, lc, cmt_cc, cnt_cc, rwer = pr_work(pr_id, prj_id, pr_num)
        reviews += rw
        rw_authors.extend(rwer)
        # print pr_id, rw, reviews
        files += fl
        locs += lc
        cmt_checks += cmt_cc
        cnt_checks += cnt_cc
    print len(cnt), files, locs, reviews, cmt_checks, cnt_checks, len(set(rw_authors))
    output = (files, locs, reviews, cmt_checks, cnt_checks, len(cnt))
    # the same statistics restricted to duplicate PRs
    prs = dbop.select_all("select prj_id, pr_num from rq2_metrics where is_dup=1")
    reviews, files, locs, cmt_checks, cnt_checks = 0, 0, 0, 0, 0
    rw_authors = []
    print len(prs)
    cnt = set()
    for pr in prs:
        prj_id, pr_num = pr
        cursor.execute("select id,author from `pull-request` where prj_id=%s and pr_num=%s", (prj_id, pr_num))
        pr_id, pr_author = cursor.fetchone()
        cnt.add(pr_author)
        rw, fl, lc, cmt_cc, cnt_cc, rwer = pr_work(pr_id, prj_id, pr_num)
        reviews += rw
        rw_authors.extend(rwer)
        files += fl
        locs += lc
        cmt_checks += cmt_cc
        cnt_checks += cnt_cc
    print len(cnt), files, locs, reviews, cmt_checks, cnt_checks, len(set(rw_authors))
    print files * 1.0 / output[0], locs * 1.0 / output[1], reviews * 1.0 / output[2], cmt_checks * 1.0 / output[3], cnt_checks * 1.0 / output[4], len(cnt) * 1.0 / output[5]
def update_real_change():
    # for PRs whose commits come from multiple authors, keep only the changes made by the PR author
    count = 1
    cursor.execute("select prj_id, pr_num, author from `pull-request`")
    prs = cursor.fetchall()
    for pr in prs:
        prj_id, pr_num, pr_author = pr
        if mutl_cmt_authors(prj_id, pr_num):
            print count
            count += 1
            cursor.execute("select pr_id, additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                           (prj_id, pr_num))
            fs = cursor.fetchone()
            pr_id, old_adds, old_dels, old_files = fs
            r_f, r_a, r_d = _get_real_change(prj_id, pr_num, (old_adds, old_dels, old_files))
            print old_files, old_adds, old_dels, r_f, r_a, r_d
            cursor.execute("select user_name, repo_name from project where id=%s", (prj_id,))
            un, rn = cursor.fetchone()
            print "https://github.com/%s/%s/pull/%d" % (un, rn, pr_num)
            cursor.execute("update pr_changes set additions=%s, deletions=%s, changed_files=%s where pr_id=%s",
                           (r_a, r_d, r_f, pr_id))
            conn.commit()
def commit_compare():
    # statistics over all PRs
    prs = dbop.select_all("select id, prj_id, pr_num,author from `pull-request`")
    print len(prs)
    commits = 0
    for pr in prs:
        pr_id, prj_id, pr_num, pr_author = pr
import pandas as pd
def rq3_pr_size_stas():
    # descriptive statistics (mean, std, min, median, max) of PR size for the RQ3 sample,
    # printed as LaTeX table rows
    cursor.execute("select id, repo_id, pr_num, files, churn from rq3_metrics")
    result = cursor.fetchall()
    files, locs = [], []
    for item in result:
        pr_id, prj_id, pr_num, file_changed, loc = item
        cursor.execute("select additions, deletions, changed_files from pr_changes where repo_id=%s and pr_num=%s",
                       (prj_id, pr_num))
        fs = cursor.fetchone()
        files.append(fs[2])
        locs.append(fs[0] + fs[1])
    ss = pd.Series(files)
    nums = [ss.mean(), ss.std(), ss.median()]
    for i in range(0, len(nums)):
        ssn = nums[i]
        if ssn > 100:
            nums[i] = "%.1e" % ssn
        else:
            nums[i] = "%.2f" % ssn
    s_min, s_max = ss.min(), ss.max()
    if s_max > 100:
        s_max = "%.1e" % s_max
    print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\" % ("files\_changed", nums[0], nums[1], s_min, nums[2], s_max)
    ss = pd.Series(locs)
    nums = [ss.mean(), ss.std(), ss.median()]
    for i in range(0, len(nums)):
        ssn = nums[i]
        if ssn > 100:
            nums[i] = "%.1e" % ssn
        else:
            nums[i] = "%.2f" % ssn
    s_min, s_max = ss.min(), ss.max()
    if s_max > 100:
        s_max = "%.1e" % s_max
    print "\\texttt{%s}& %s &%s &%s&%s&%s\\\\" % ("churn", nums[0], nums[1], s_min, nums[2], s_max)
if __name__ == "__main__":
    # extract_pr_change()
    # update_extract_pr_change()
    # work_compare()
    # update_rq2()
    # update_real_change()
    # commit_compare()
    rq3_pr_size_stas()