duppr_analysis/experiment_code/fix_pair.py

95 lines
2.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#coding:utf-8
# Check whether the new duplicate pairs implied by transitivity already exist in the dataset.
from tuple_pair import get_dups
import MySQLdb
import json
# Load the settings from _config.json (resolved against the current working
# directory) and keep the "local_db" section for the connection below.
with open("_config.json") as fp:
    config = json.load(fp)
local_db_config = config["local_db"]

# One connection and one cursor, created at import time; the functions in this
# file issue their queries through the module-level `cursor`.
conn = MySQLdb.connect(
    host=local_db_config["db_host"],
    user=local_db_config["db_user"],
    passwd=local_db_config["db_passwd"],
    db=local_db_config["db_name"],
    port=3306,
    charset='utf8mb4',
)
cursor = conn.cursor()
def _pair_is_suspect(prj_id, first_pr, second_pr, known_pairs):
    """Print and flag a pair that is not recorded yet and fails ``valid_pair``.

    Returns True when the label "<prj_id>-<first_pr>-<second_pr>" is absent
    from ``known_pairs`` and ``valid_pair`` rejects the pair; in that case the
    two PR numbers are printed on a tab-prefixed line. Returns False otherwise.
    """
    label = "%d-%d-%d" % (prj_id, first_pr, second_pr)
    if label in known_pairs:
        return False
    if valid_pair(prj_id, first_pr, second_pr):
        return False
    # Same bytes as the Python 2 statement `print "\t", a, b`: no space is
    # emitted after the tab.
    print("\t%s %s" % (first_pr, second_pr))
    return True


def create_pair():
    """Scan duplicate groups and report pairs that fail ``valid_pair``.

    For every (project, master PR, duplicates) group returned by ``get_dups()``
    that has more than one duplicate, the master/duplicate pairs and then the
    duplicate/duplicate pairs are examined. A pair is printed when it is not
    among the labels loaded from the ``duplicate`` table and ``valid_pair``
    rejects it. Finally the number of groups examined and the number of groups
    with at least one rejected pair are printed.

    Returns:
        list: the ``new_pair`` list. It is always empty at the moment because
        nothing is ever appended to it.
    """
    # First load the pairs that are already present in the duplicate table.
    cursor.execute("select prj_id, mst_pr, dup_pr from duplicate")
    # NOTE(review): stored labels are built as prj_id-dup_pr-mst_pr, while the
    # lookups below use prj_id-mst-dup. Confirm this matches the orientation
    # of the data returned by get_dups().
    already_pairs = {
        "%d-%d-%d" % (row[0], row[2], row[1]) for row in cursor.fetchall()
    }

    count = 0
    error_count = 0
    new_pair = []

    # Build candidate pairs from the duplicate tuples.
    tuples = get_dups()
    for prj_id, groups in tuples.items():
        for mst, dups in groups.items():
            # NOTE(review): the original indentation was lost; everything
            # below is assumed to apply only to groups with more than one
            # duplicate -- confirm against the upstream file.
            if len(dups) <= 1:
                continue
            count += 1
            print("%s %s %s" % (prj_id, mst, dups))

            # Master/duplicate pairs first, then every unordered pair of
            # duplicates, in the same order the index loops produced.
            candidates = [(mst, dup) for dup in dups]
            candidates += [
                (first, second)
                for idx, first in enumerate(dups)
                for second in dups[idx + 1:]
            ]

            has_error = False
            for first, second in candidates:
                # No short-circuit: every rejected pair must be printed.
                if _pair_is_suspect(prj_id, first, second, already_pairs):
                    has_error = True
            if has_error:
                error_count += 1

    print("%s %s" % (count, error_count))
    return new_pair
def valid_pair(prj_id, mst_pr, dup_pr, db_cursor=None):
    """Tell whether ``dup_pr`` looks like an unintentional duplicate of ``mst_pr``.

    A pair is rejected (False) when any of the following holds:
      * both pull requests have the same author;
      * the duplicate's title or description contains the master PR number;
      * the duplicate's author commented on the master PR before the
        duplicate was created.

    Args:
        prj_id: project id used in the ``pull-request`` lookups.
        mst_pr: PR number of the master pull request.
        dup_pr: PR number of the duplicate pull request.
        db_cursor: optional DB-API cursor to run the queries on; when omitted
            the module-level ``cursor`` is used.

    Returns:
        bool: True when none of the rejection rules applies.

    Raises:
        TypeError: if either pull request lookup yields no row, because the
            ``None`` returned by ``fetchone()`` cannot be unpacked.
    """
    if db_cursor is None:
        db_cursor = cursor

    db_cursor.execute(
        "select author,id from `pull-request` where prj_id=%s and pr_num=%s",
        (prj_id, mst_pr))
    mst_author, mst_id = db_cursor.fetchone()

    db_cursor.execute(
        "select author,created_at,title,description from `pull-request` where prj_id=%s and pr_num=%s",
        (prj_id, dup_pr))
    dup_author, dup_time, dup_title, dup_dsc = db_cursor.fetchone()

    # Rule 1: the two PRs must not come from the same person.
    if mst_author == dup_author:
        return False

    # Rule 2: the authors must be unaware of each other -- the duplicate must
    # not mention the master PR. A NULL title/description comes back as None,
    # so treat it as empty text instead of crashing on it.
    # NOTE: this is a plain substring match, so PR number 12 also matches
    # text that contains 123.
    mst_ref = "%s" % mst_pr
    if mst_ref in (dup_title or "") or mst_ref in (dup_dsc or ""):
        return False

    # Rule 3: the duplicate's author must not have joined the master PR's
    # discussion before opening the duplicate.
    db_cursor.execute(
        "select author,created_at from comment where pr_id=%s", (mst_id,))
    for cmt_author, cmt_time in db_cursor.fetchall():
        if cmt_author == dup_author and cmt_time < dup_time:
            return False
    return True
def work():
    """Re-validate each pair label returned by create_pair() and print the rejected ones."""
    for label in create_pair():
        # Each label is split on "-" into project id, master PR and duplicate PR.
        prj_id, mst_pr, dup_pr = label.split("-")
        if valid_pair(prj_id, mst_pr, dup_pr):
            continue
        print(label)
# Run the pair check only when this file is executed as a script.
if __name__ == "__main__":
    work()