# Python source, 354 lines, 14 KiB
from neo4j import GraphDatabase
|
|
import numpy as np
|
|
from datetime import datetime, timezone
|
|
import collections
|
|
# from doc2vec_model import sim
|
|
import ast
|
|
|
|
def get_labels(session, cnt):
    """Build the issue-by-user 0/1 label matrix and save it to ``./labels.txt``.

    The key -> index mapping is read from ``./id_table1000.txt``. Every
    (user, issue) pair returned by the query (users linked to closed
    ``numpy_numpy`` issues with ``number < 1000``) is marked with 1 at
    ``[index(issue)][index(user) - cnt]``.

    Args:
        session: Neo4j session object; only ``session.run(query)`` is used.
        cnt: Offset subtracted from a user's table index to get its column.
            It is also used as the number of rows. get_id_table numbers
            issues before users, so this is expected to be the issue count.

    Side effects:
        Writes the matrix to ``./labels.txt`` as integers.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # The file holds a dict literal; literal_eval parses it without
        # executing arbitrary code the way eval() would.
        id_table = ast.literal_eval(f.read())

    # The shape used to be hard-coded as 574 x 196. Deriving it from `cnt`
    # and the table size gives the same shape for that data set and keeps
    # the function usable for other slices.
    labels = np.zeros((cnt, len(id_table) - cnt), dtype=int)

    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return a,b")
    for res in results:
        user = res.get('a')._properties['id']
        issue = res.get('b')._properties['name']
        labels[id_table[issue]][id_table[user] - cnt] = 1

    np.savetxt('./labels.txt', labels, fmt='%d')
|
|
|
|
def get_id_table(session):
    """Number issues, then users, consecutively and persist the mapping.

    Issue names (from closed ``numpy_numpy`` issues with ``number < 1000``)
    receive indices starting at 0; the ids of the distinct users linked to
    those issues continue the same counter. The running counter is printed
    after each group.

    Args:
        session: Neo4j session object; only ``session.run(query)`` is used.

    Side effects:
        Writes ``str(mapping)`` to ``./id_table1000.txt``.
    """
    issue_query = "MATCH (b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return b"
    user_query = "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return distinct a"

    index_of = {}
    next_index = 0

    # Author's recorded issue counts for other thresholds (not verifiable
    # from this file): <10000 -> 5071, <3000 -> 2342, <500 -> 574.
    for record in session.run(issue_query):
        # A repeated name is not skipped: it is simply re-assigned the
        # newest index, exactly as before.
        index_of[record.get('b')._properties['name']] = next_index
        next_index += 1
    print(next_index)

    # Author's recorded user counts for other thresholds (not verifiable
    # from this file): <10000 -> 3181, <3000 -> 671, <500 -> 196.
    for record in session.run(user_query):
        index_of[record.get('a')._properties['id']] = next_index
        next_index += 1
    print(next_index)

    with open('./id_table1000.txt', 'w', encoding='utf-8') as out:
        out.write(str(index_of))
|
|
|
|
def get_adj(session):
    """Build the symmetric adjacency matrix over all indexed nodes.

    Node indices come from ``./id_table1000.txt``. Two kinds of edges are
    set to 1 in both directions:

    * issue -> issue relationships between closed ``numpy_numpy`` issues
      with ``number < 1000``;
    * user -> issue relationships to those same issues.

    Args:
        session: Neo4j session object; only ``session.run(query)`` is used.

    Side effects:
        Writes the matrix to ``./adj1000.txt`` and prints ``adj finished``.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # literal_eval parses the stored dict literal without the
        # code-execution risk of eval().
        id_table = ast.literal_eval(f.read())

    size = len(id_table)
    adj = np.zeros((size, size), dtype=int)

    # FIX: the source issue was filtered with `b.id<1000` while the id table
    # and every other query select issues by `b.number`. Filtering on
    # `number` keeps both endpoints inside the indexed set.
    results = session.run(
        "MATCH (a:repository)<-[:belongsto]-(b:issue)-[]->(c:issue)-[:belongsto]->(d:repository) "
        "where a.name='numpy_numpy' and b.state='closed' and b.number <1000 "
        "and d.name='numpy_numpy' and c.state='closed' and c.number <1000 return b,c")
    for res in results:
        fr = id_table[res.get('b')._properties['name']]
        to = id_table[res.get('c')._properties['name']]
        adj[fr][to] = 1
        adj[to][fr] = 1

    results = session.run("MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
                          "where c.name='numpy_numpy' and b.state='closed' and b.number <1000 return a,b")
    for res in results:
        fr = id_table[res.get('a')._properties['id']]
        to = id_table[res.get('b')._properties['name']]
        adj[fr][to] = 1
        adj[to][fr] = 1

    np.savetxt('./adj1000.txt', adj, fmt='%d')
    print('adj finished')
|
|
|
|
def get_t(session):
    """Return the earliest and latest relationship timestamps.

    Scans the ``time`` property of every user -> issue relationship on
    closed ``numpy_numpy`` issues with ``number < 1000``.

    Args:
        session: Neo4j session object; only ``session.run(query)`` is used.

    Returns:
        ``(tbegin, tend)`` as timezone-aware UTC datetimes. If the query
        yields no rows, the historical placeholder pair
        (2024-03-01T15:09:30Z, 2000-03-01T15:09:30Z) is returned, as before.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"

    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
        "where b.state='closed' and c.name='numpy_numpy' and b.number <1000 return a,b,c,r")

    # FIX: the bounds used to be seeded with fixed dates (2024 / 2000), which
    # silently capped the result when all data lay outside that window.
    # Track the true extremes instead.
    tbegin = None
    tend = None
    for res in results:
        stamp = datetime.strptime(res.get('r')._properties['time'], time_format).replace(tzinfo=timezone.utc)
        if tbegin is None or stamp < tbegin:
            tbegin = stamp
        if tend is None or stamp > tend:
            tend = stamp

    if tbegin is None:
        # No rows: keep returning the original placeholder values.
        tbegin = datetime.strptime("2024-03-01T15:09:30Z", time_format).replace(tzinfo=timezone.utc)
        tend = datetime.strptime("2000-03-01T15:09:30Z", time_format).replace(tzinfo=timezone.utc)
    return tbegin, tend
|
|
|
|
def get_c(session, te, tb, cnt):
    """Score every pair of users that share at least one issue.

    The user-issue block ``adj[cnt:, 0:cnt]`` of ``./adj1000.txt`` decides
    which issues two users have in common. For each shared issue the
    timestamps of both users' relationships to it are fetched from Neo4j and
    grouped by relationship type; the pair is then scored with
    ``comepute_c`` for sigma = 0.6, 0.7, 0.8 and 0.9. Only the upper
    triangle (``i < j``) is filled.

    Args:
        session: Neo4j session object; only ``session.run`` is used.
        te: End of the time window, forwarded to ``comepute_c``.
        tb: Start of the time window, forwarded to ``comepute_c``.
        cnt: Index of the first user row in the adjacency matrix.

    Side effects:
        Prints progress and writes ``./c_6_1000.txt`` .. ``./c_9_1000.txt``.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # Parameterised instead of string-concatenated, so ids and names with
    # quotes or other special characters cannot break or alter the query.
    query = ("MATCH (a:user)-[r]->(b:issue) "
             "where a.id=$uid and b.name=$name return a,b,r")
    sigma_levels = [('6', 0.6), ('7', 0.7), ('8', 0.8), ('9', 0.9)]

    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # literal_eval parses the stored dict literal without the
        # code-execution risk of eval().
        id_table = ast.literal_eval(f.read())

    # Inverse mapping: table index -> issue name / user id.
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:, 0:cnt]  # user-issue adjacency matrix
    n_users = u_v.shape[0]

    matrices = {tag: np.zeros((n_users, n_users)) for tag, _ in sigma_levels}

    x = 0  # number of user pairs processed so far (progress output only)
    for i in range(n_users):
        for j in range(i + 1, n_users):
            # Columns where both users have an edge, i.e. shared issues.
            pos = np.flatnonzero(u_v[i] * u_v[j])
            if pos.size == 0:
                continue
            x += 1
            print(len(pos), end=' ')

            time_list = dict()
            for k in pos:
                issue = id_list[k]
                by_type = collections.defaultdict(list)
                time_list[issue] = by_type
                for user in (id_list[i + cnt], id_list[j + cnt]):
                    for res in session.run(query, {'uid': user, 'name': str(issue)}):
                        rel = res.get('r')
                        by_type[rel.type].append(
                            datetime.strptime(rel._properties['time'], time_format).replace(
                                tzinfo=timezone.utc))

            # compute c(u, u) for each decay factor
            scores = [comepute_c(time_list, sigma, te, tb) for _, sigma in sigma_levels]
            for (tag, _), score in zip(sigma_levels, scores):
                matrices[tag][i][j] = score
            print(x, tuple([i, j, scores[0]]), end=' ')

    for tag, _ in sigma_levels:
        np.savetxt('./c_' + tag + '_1000.txt', matrices[tag], fmt='%f')
|
|
def comepute_c(time_list, sigma, te, tb):
    """Average a decayed, normalised time score over objects and action types.

    For every object ``o`` and action type ``a`` the timestamps are sorted
    newest first; the timestamp at position ``t`` contributes
    ``sigma ** (n - t - 1) * (time - tb) / (te - tb)``, where ``n`` is the
    number of timestamps. Contributions are averaged per action type, then
    per object, then over all objects.

    Args:
        time_list: Mapping ``object -> {action type -> list of timestamps}``.
        sigma: Decay base.
        te: End of the time window.
        tb: Start of the time window.

    Returns:
        The averaged score; 0 when ``time_list`` is empty.

    Raises:
        ZeroDivisionError: If ``te == tb`` and at least one timestamp exists.
    """
    # Guard: an empty mapping used to raise ZeroDivisionError.
    if not time_list:
        return 0

    value = 0
    for actions in time_list.values():
        if not actions:
            # An object with no recorded actions contributes nothing
            # (it used to raise ZeroDivisionError).
            continue
        value0 = 0
        for stamps in actions.values():
            times = sorted(stamps, reverse=True)
            count = len(times)
            if not count:
                continue
            value1 = 0
            for t, stamp in enumerate(times):
                # NOTE(review): with newest-first ordering the newest entry
                # gets the largest exponent — confirm that is intended.
                value1 += sigma ** (count - t - 1) * (stamp - tb) / (te - tb)
            value0 += value1 / count
        value += value0 / len(actions)
    return value / len(time_list)
|
|
|
|
def get_r(session, te, tb, cnt):
    """Score every existing user-issue edge.

    For each non-zero entry of the user-issue block ``adj[cnt:, 0:cnt]`` of
    ``./adj1000.txt`` the timestamps of the user's relationships to that
    issue are fetched from Neo4j, grouped by relationship type, and scored
    with ``comepute_c`` for sigma = 0.6, 0.7, 0.8 and 0.9.

    Args:
        session: Neo4j session object; only ``session.run`` is used.
        te: End of the time window, forwarded to ``comepute_c``.
        tb: Start of the time window, forwarded to ``comepute_c``.
        cnt: Index of the first user row in the adjacency matrix.

    Side effects:
        Prints progress and writes ``./r_6_1000.txt`` .. ``./r_9_1000.txt``
        (one row per user, one column per issue).
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # Parameterised instead of string-concatenated, so ids and names with
    # quotes or other special characters cannot break or alter the query.
    query = ("MATCH (a:user)-[r]->(b:issue) "
             "where a.id=$uid and b.name=$name return a,b,r")
    sigma_levels = [('6', 0.6), ('7', 0.7), ('8', 0.8), ('9', 0.9)]

    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # literal_eval parses the stored dict literal without the
        # code-execution risk of eval().
        id_table = ast.literal_eval(f.read())

    # Inverse mapping: table index -> issue name / user id.
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:, 0:cnt]  # user-issue adjacency matrix
    n_users, n_issues = u_v.shape

    matrices = {tag: np.zeros((n_users, n_issues)) for tag, _ in sigma_levels}

    for i in range(n_users):
        for j in range(n_issues):
            if not u_v[i][j]:
                continue

            issue = id_list[j]
            by_type = collections.defaultdict(list)
            time_list = {issue: by_type}
            for res in session.run(query, {'uid': id_list[i + cnt], 'name': str(issue)}):
                rel = res.get('r')
                by_type[rel.type].append(
                    datetime.strptime(rel._properties['time'], time_format).replace(
                        tzinfo=timezone.utc))

            # compute r(u, v) for each decay factor
            scores = [comepute_c(time_list, sigma, te, tb) for _, sigma in sigma_levels]
            for (tag, _), score in zip(sigma_levels, scores):
                matrices[tag][i][j] = score
            print(tuple([i, j, scores[0]]), end=' ')

    for tag, _ in sigma_levels:
        np.savetxt('./r_' + tag + '_1000.txt', matrices[tag], fmt='%f')
|
|
|
|
def get_r_old(session, u_v, sigma, te, tb):  # u rows, v columns
    """Older variant of the user-issue scoring, writing a list to ``./r.txt``.

    Every non-integer key of ``./id_table.txt`` is treated as an issue name.
    For each issue, the timestamps of all user relationships are grouped by
    user and relationship type and reduced with the same decayed average used
    elsewhere in this file: ``sigma ** (n - idx - 1) * (t - tb) / (te - tb)``
    over timestamps sorted newest first.

    Args:
        session: Neo4j session object; only ``session.run`` is used.
        u_v: User-by-issue matrix; only its dimensions are read.
        sigma: Decay base.
        te: End of the time window.
        tb: Start of the time window.

    Side effects:
        Writes ``str(r)`` (nested list, one row per user) to ``./r.txt``.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # FIX: the issue name used to be concatenated into the query unquoted;
    # pass it as a parameter instead.
    query = ("MATCH (a:user)-[r]->(b:issue|pr)-[:belongsto]-(c:repository) "
             "where c.name='numpy_numpy' and b.name=$name return a,b,r")

    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # literal_eval parses the stored dict literal without the
        # code-execution risk of eval().
        id_table = ast.literal_eval(f.read())

    n_issues = len(u_v[0])
    r = [[0 for _ in range(n_issues)] for _ in range(len(u_v))]

    # Integer keys are user ids; everything else is an issue name.
    issue_set = {key for key in id_table if not isinstance(key, int)}

    for issue in issue_set:
        # user id -> relationship type -> list of timestamps
        cnt = dict()
        for res in session.run(query, {'name': str(issue)}):
            user = res.get('a')._properties['id']
            rel = res.get('r')
            cnt.setdefault(user, dict()).setdefault(rel.type, list()).append(
                datetime.strptime(rel._properties['time'], time_format).replace(tzinfo=timezone.utc))

        for user, by_type in cnt.items():
            value = 0
            for stamps in by_type.values():
                times = sorted(stamps, reverse=True)
                value1 = 0
                # FIX: the exponent used the datetime itself
                # (`len(times) - t - 1`), which raises TypeError; use the
                # position in the sorted list.
                for idx, t in enumerate(times):
                    value1 += sigma ** (len(times) - idx - 1) * (t - tb) / (te - tb)
                value += value1 / len(times)
            value /= len(by_type)
            # FIX: only the [user][issue] cell is written. The former mirrored
            # write r[issue][user] indexed a u-by-v matrix with swapped axes.
            r[id_table[user] - n_issues][id_table[issue]] = value

    with open('./r.txt', 'w', encoding='utf-8') as f:
        f.write(str(r))
|
|
|
|
def get_s1(cnt):
    """Write the pairwise dot products of the issue rows to ``./s1_1000.txt``.

    Takes the block ``adj[0:cnt, cnt:]`` of ``./adj1000.txt`` and stores, for
    every pair of its rows ``(i, j)``, the dot product of row ``i`` and
    row ``j``.

    Args:
        cnt: Number of leading rows to use; also the first column of the
            block.

    Side effects:
        Writes the ``cnt x cnt`` result with ``%f`` formatting. The former
        per-row progress output is gone because the work is now one matrix
        product.
    """
    adj = np.loadtxt('./adj1000.txt', dtype=int)
    v_u = adj[0:cnt, cnt:]

    # One matrix product replaces the Python-level double loop of
    # row-by-row dot products; the values are identical.
    s1 = v_u @ v_u.T

    np.savetxt('./s1_1000.txt', s1, fmt='%f')
|
|
|
|
def get_s(session, v_v):
    """Compute two issue-issue similarity matrices and dump them to disk.

    ``s1`` is the Jaccard similarity of the sets of users linked to each pair
    of closed ``numpy_numpy`` issues/PRs; ``s2`` is whatever ``sim(k1, k2)``
    returns for the two issue names.

    NOTE(review): ``sim`` is not defined in this block; the
    ``from doc2vec_model import sim`` import at the top of the file is
    commented out, so it must be made available before calling this.

    Args:
        session: Neo4j session object; only ``session.run(query)`` is used.
        v_v: Issue-by-issue matrix; only its length is read.

    Side effects:
        Writes ``str(s1)`` to ``./s1.txt`` and ``str(s2)`` to ``./s2.txt``.
    """
    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # literal_eval parses the stored dict literal without the
        # code-execution risk of eval().
        id_table = ast.literal_eval(f.read())

    size = len(v_v)
    s1 = [[0 for _ in range(size)] for _ in range(size)]
    s2 = [[0 for _ in range(size)] for _ in range(size)]

    # issue name -> ids of the users linked to it
    cnt = collections.defaultdict(set)
    results = session.run(
        "MATCH (a:user)-[]->(b:issue|pr)-[:belongsto]-(c:repository) where c.name='numpy_numpy' and b.state='closed' return a,b")
    for res in results:
        issue = res.get('b')._properties['name']
        # FIX: collect the *user's* id. The issue's own id was stored before,
        # which made every set a singleton and every similarity 0.
        cnt[issue].add(res.get('a')._properties['id'])

    for k1 in cnt:
        for k2 in cnt:
            if k1 == k2:
                continue
            row, col = id_table[k1], id_table[k2]
            s1[row][col] = len(cnt[k1].intersection(cnt[k2])) / len(cnt[k1].union(cnt[k2]))
            # FIX: store per cell. `s2 = sim(...)` replaced the whole matrix
            # with the last scalar.
            s2[row][col] = sim(k1, k2)

    with open('./s1.txt', 'w', encoding='utf-8') as f:
        f.write(str(s1))
    with open('./s2.txt', 'w', encoding='utf-8') as f:
        f.write(str(s2))
|
|
|
|
def get_num():
    """Write the row sums of ``./adj1000.txt`` to ``./num1000.txt``.

    The matrix is loaded as integers and each row is summed; the resulting
    vector is saved one value per line with ``%d`` formatting.
    """
    adjacency = np.loadtxt('./adj1000.txt', dtype=int)
    row_totals = adjacency.sum(axis=1)
    np.savetxt('./num1000.txt', row_totals, fmt='%d')
|
|
#
|
|
# with open('./num.txt', 'w', encoding='utf-8') as f:
|
|
# f.write(str(list(num)))
|
|
|
|
def reprocess(filename):
    """Fill zero entries of a square matrix from the transposed position.

    Loads the matrix in ``filename``; wherever an entry is zero it is
    replaced with the entry at the mirrored position ``[j][i]``. The result
    is written to ``./data/<filename>``.

    Args:
        filename: Path of a text matrix readable by ``np.loadtxt``. The same
            string is appended to ``./data/`` for the output path.

    Raises:
        ValueError: If the loaded data is not a square 2-D matrix.
    """
    # Local import: `os` is not among the imports visible at file level.
    import os

    data = np.loadtxt(filename, dtype=float)
    if data.ndim != 2 or data.shape[0] != data.shape[1]:
        raise ValueError('reprocess expects a square matrix, got shape %s' % (data.shape,))

    # Vectorised form of the element-wise loop: a zero cell takes the value
    # of its mirror cell, non-zero cells are left untouched.
    mirrored = np.where(data == 0, data.T, data)

    out_path = './data/' + filename
    # The write used to fail when ./data did not exist yet.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    np.savetxt(out_path, mirrored, fmt='%f')
|
|
|
|
def get_index_pairs(cnt):
    """Enumerate (1-entry, 0-entry) index pairs for each row of the block.

    Works on ``adj[0:cnt, cnt:]`` of ``./adj1000.txt``. For every row, each
    column holding 1 is paired with each column holding 0. The 1-side of a
    pair goes to ``./one_index_1000.txt`` and the 0-side to
    ``./zero_index_1000.txt``, so line ``n`` of both files belongs to the
    same pair. Column indices are shifted by ``cnt`` back into full-matrix
    coordinates.

    Args:
        cnt: Number of leading rows to use; also the first column of the
            block.
    """
    full_adj = np.loadtxt('./adj1000.txt', dtype=int)
    block = full_adj[0:cnt, cnt:]

    one_index = []
    zero_index = []
    for row, links in enumerate(block):
        linked = np.where(links == 1)[0]      # positions of the 1s
        unlinked = np.where(links == 0)[0]    # positions of the 0s
        for one in linked:
            # One entry per 0-column, so both lists stay aligned.
            one_index.extend([row, one + cnt] for _ in unlinked)
            zero_index.extend([row, zero + cnt] for zero in unlinked)

    one_index = np.array(one_index)
    zero_index = np.array(zero_index)
    np.savetxt('./one_index_1000.txt', one_index, fmt='%d')
    np.savetxt('./zero_index_1000.txt', zero_index, fmt='%d')
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point. The pipeline steps are kept below as commented-out
    # calls and are enabled by hand; as the file stands, only
    # get_index_pairs runs. The literal 574 is the `cnt` argument passed to
    # every step that takes one.
    #
    # driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "12345678"))
    # session = driver.session()
    #
    # get_id_table(session)
    # get_adj(session)
    # tbegin,tend = get_t(session)
    # get_num()
    #
    # Fixed time windows noted by the author for the "3000" run:
    # # tbegin = datetime.strptime("2012-06-05T04:40:41Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # # tend = datetime.strptime("2024-02-15T21:17:43Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # ... and for the "10000" run:
    # # # tbegin = datetime.strptime("2012-06-05T04:40:41Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # # # tend = datetime.strptime("2024-02-15T21:17:43Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    #
    # # sigma 0.6 0.7 0.8 0.9
    # get_c(session, tend, tbegin, 574)
    # get_r(session, tend, tbegin,574)
    #
    # get_labels(session,574)
    #
    # driver.close()
    #
    # get_s1(574)
    #
    # reprocess('c_6_1000.txt')
    # reprocess('c_7_1000.txt')
    # reprocess('c_8_1000.txt')
    # reprocess('c_9_1000.txt')

    # Reads ./adj1000.txt and writes the one/zero index pair files.
    get_index_pairs(574)