95 lines
2.8 KiB
Python
95 lines
2.8 KiB
Python
#coding:utf-8
|
||
# Check whether the new duplicate pairs implied by transitivity are already present in the existing dataset.
|
||
|
||
from tuple_pair import get_dups
|
||
import MySQLdb
|
||
import json
|
||
# Read the connection settings for the local database from _config.json.
with open("_config.json") as fp:
    config = json.load(fp)

local_db_config = config["local_db"]

# Module-level connection and cursor; the functions in this file issue their
# queries through `cursor`.
conn = MySQLdb.connect(
    host=local_db_config["db_host"],
    user=local_db_config["db_user"],
    passwd=local_db_config["db_passwd"],
    db=local_db_config["db_name"],
    port=3306,
    charset='utf8mb4',
)
cursor = conn.cursor()
|
||
|
||
def create_pair():
    """Check the pairs implied by each duplicate tuple against known pairs.

    Loads the rows of the ``duplicate`` table, then walks the mapping
    returned by ``get_dups()`` (project id -> {master PR -> sequence of
    duplicate PRs}).  For every tuple with more than one duplicate, each
    master/duplicate pair and each duplicate/duplicate pair whose label is
    not among the loaded ones is passed to ``valid_pair``.  Pairs that fail
    are printed (tab-indented) and the tuple is counted as erroneous.

    Output: one line per inspected tuple, one line per failing pair, and a
    final line "<inspected tuples> <erroneous tuples>".

    Returns:
        list: ``new_pair``.  Nothing is appended to it in this function, so
        the returned list is always empty.
    """
    from itertools import combinations

    # Load the pairs that already exist in the duplicate table.
    cursor.execute("select prj_id, mst_pr, dup_pr from duplicate")
    # Stored labels are "<prj_id>-<dup_pr>-<mst_pr>" (duplicate first).
    # NOTE(review): the lookups below build "<prj_id>-<mst>-<dup>"; confirm
    # against get_dups() that this orientation is intended.
    already_pairs = {
        "%d-%d-%d" % (row_prj, row_dup, row_mst)
        for row_prj, row_mst, row_dup in cursor.fetchall()
    }

    def _fails_validation(prj_id, first, second):
        """Print the pair and return True if it is unseen and not valid."""
        label = "%d-%d-%d" % (prj_id, first, second)
        if label in already_pairs or valid_pair(prj_id, first, second):
            return False
        # A single pre-formatted string prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("\t%s %s" % (first, second))
        return True

    count = 0
    error_count = 0
    new_pair = []

    # Build the candidate pairs from each tuple.
    for prj_id, masters in get_dups().items():
        for mst, dups in masters.items():
            if len(dups) <= 1:
                continue
            count += 1
            print("%s %s %s" % (prj_id, mst, dups))

            # Master/duplicate pairs first, then every unordered pair of
            # duplicates in positional order.
            candidates = [(mst, dup) for dup in dups]
            candidates.extend(combinations(dups, 2))

            # Check every candidate (no early exit) so that all failing
            # pairs of the tuple get printed.
            has_error = False
            for first, second in candidates:
                if _fails_validation(prj_id, first, second):
                    has_error = True
            if has_error:
                error_count += 1

    print("%s %s" % (count, error_count))
    return new_pair
|
||
|
||
def valid_pair(prj_id, mst_pr, dup_pr):
    """Decide whether ``dup_pr`` was submitted independently of ``mst_pr``.

    The pair is rejected (``False``) when any of the following holds:

    * both pull requests have the same author;
    * the duplicate's title or description contains the master's PR number
      as a substring;
    * the duplicate's author commented on the master before the duplicate
      was created.

    Args:
        prj_id: project id used to look up both pull requests.
        mst_pr: PR number of the master pull request.
        dup_pr: PR number of the duplicate pull request.

    Returns:
        bool: ``True`` when none of the rejection rules applies.

    Raises:
        LookupError: if either pull request has no row in ``pull-request``.
    """
    cursor.execute(
        "select author,id from `pull-request` where prj_id=%s and pr_num=%s",
        (prj_id, mst_pr))
    mst_row = cursor.fetchone()
    if mst_row is None:
        # fetchone() yields None when the query matched nothing; fail with a
        # message that names the missing PR instead of an unpacking error.
        raise LookupError(
            "pull request %s not found in project %s" % (mst_pr, prj_id))
    mst_author, mst_id = mst_row

    cursor.execute(
        "select author,created_at,title,description from `pull-request` where prj_id=%s and pr_num=%s",
        (prj_id, dup_pr))
    dup_row = cursor.fetchone()
    if dup_row is None:
        raise LookupError(
            "pull request %s not found in project %s" % (dup_pr, prj_id))
    dup_author, dup_time, dup_title, dup_dsc = dup_row

    # Rule 1: the two pull requests must come from different authors.
    if mst_author == dup_author:
        return False

    # Rule 2: the duplicate must not reference the master.  NULL columns come
    # back as None, so treat them as empty text.
    # NOTE(review): this is a plain substring test, so master number 12 also
    # matches text such as "#123".
    mst_ref = "%s" % mst_pr
    if mst_ref in (dup_title or "") or mst_ref in (dup_dsc or ""):
        return False

    # Rule 3: the duplicate's author must not have commented on the master
    # before opening the duplicate.
    cursor.execute("select author,created_at from comment where pr_id=%s", (mst_id,))
    return not any(
        cmt_author == dup_author and cmt_time < dup_time
        for cmt_author, cmt_time in cursor.fetchall()
    )
|
||
|
||
def work():
    """Re-check the pairs returned by ``create_pair`` and print the invalid ones.

    Each pair is a "<prj_id>-<mst_pr>-<dup_pr>" label; its three parts are
    handed to ``valid_pair`` as strings.
    """
    for label in create_pair():
        project, master, duplicate = label.split("-")
        if valid_pair(project, master, duplicate):
            continue
        print(label)
|
||
|
||
|
||
# Run the check only when this file is executed as a script.
if __name__ == "__main__":
    work()