# Source: github_mongoDB_dataprocess/prepare_for_gat.py (354 lines, 14 KiB, Python)

import ast
import collections
import os
from datetime import datetime, timezone

import numpy as np
from neo4j import GraphDatabase

# from doc2vec_model import sim
def get_labels(session, cnt):
    """Write the issue-by-user participation matrix to ``./labels.txt``.

    The id table is read from ``./id_table1000.txt``. Rows of the matrix are
    id-table indices ``0 .. cnt-1``; columns are the remaining indices shifted
    down by ``cnt``. A cell is 1 when the query returns that (user, issue)
    pair, otherwise 0.

    Args:
        session: Neo4j session (anything exposing ``run(query)``).
        cnt: Number of leading id-table indices used as rows (574 for the
            current data set, which reproduces the former fixed 574 x 196
            shape).
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # The file holds a dict literal; literal_eval parses it without
        # executing arbitrary code the way eval() would.
        id_table = ast.literal_eval(f.read())

    # Shape is derived from the data instead of being hard-coded.
    labels = np.zeros((cnt, len(id_table) - cnt), dtype=int)

    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return a,b")
    for res in results:
        user = res.get('a')._properties['id']
        issue = res.get('b')._properties['name']
        labels[id_table[issue], id_table[user] - cnt] = 1

    np.savetxt('./labels.txt', labels, fmt='%d')
def get_id_table(session):
    """Assign a contiguous integer index to every issue and user; save the map.

    Issue names are indexed first (starting at 0), then user ids continue the
    numbering. The mapping is written to ``./id_table1000.txt`` as a Python
    dict literal, which is the format the other steps read back.

    A key that is returned more than once keeps its first index. Previously a
    repeated row overwrote the index and left a hole in the numbering, so the
    table length no longer matched the matrix dimension derived from it.

    Args:
        session: Neo4j session (anything exposing ``run(query)``).
    """
    id_table = dict()

    # Author's recorded counts for closed numpy issues (threshold -> count):
    # 10000 -> 5071, 3000 -> 2342, 500 -> 574. The active query uses number < 1000.
    results = session.run("MATCH (b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return b")
    for res in results:
        name = res.get('b')._properties['name']
        if name not in id_table:
            # len() is always the next free index, so numbering stays gap-free.
            id_table[name] = len(id_table)
    print(len(id_table))

    # Author's recorded user counts (threshold -> count):
    # 10000 -> 3181, 3000 -> 671, 500 -> 196.
    results = session.run("MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return distinct a")
    for res in results:
        user_id = res.get('a')._properties['id']
        if user_id not in id_table:
            id_table[user_id] = len(id_table)
    print(len(id_table))

    with open('./id_table1000.txt', 'w', encoding='utf-8') as f:
        f.write(str(id_table))
def get_adj(session):
    """Build the symmetric 0/1 adjacency matrix and save it to ``./adj1000.txt``.

    Node indices come from ``./id_table1000.txt``. Two kinds of edges are
    added, each in both directions:

    * issue -> issue relationships between closed numpy issues, and
    * user -> issue relationships on closed numpy issues.

    The issue -> issue query formerly filtered the source issue with
    ``b.id<1000`` while every other query in this file (including the one
    that builds the id table) filters on ``b.number``; it now uses
    ``b.number`` so both endpoints are restricted to the indexed issues.

    Args:
        session: Neo4j session (anything exposing ``run(query)``).
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Dict literal written by get_id_table; parsed without eval().
        id_table = ast.literal_eval(f.read())

    size = len(id_table)
    adj = np.zeros((size, size), dtype=int)

    results = session.run(
        "MATCH (a:repository)<-[:belongsto]-(b:issue)-[]->(c:issue)-[:belongsto]->(d:repository) "
        "where a.name='numpy_numpy' and b.state='closed' and b.number <1000 "
        "and d.name='numpy_numpy' and c.state='closed' and c.number <1000 return b,c")
    for res in results:
        fr = id_table[res.get('b')._properties['name']]
        to = id_table[res.get('c')._properties['name']]
        adj[fr, to] = 1
        adj[to, fr] = 1

    results = session.run("MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
                          "where c.name='numpy_numpy' and b.state='closed' and b.number <1000 return a,b")
    for res in results:
        fr = id_table[res.get('a')._properties['id']]
        to = id_table[res.get('b')._properties['name']]
        adj[fr, to] = 1
        adj[to, fr] = 1

    np.savetxt('./adj1000.txt', adj, fmt='%d')
    print('adj finished')
def get_t(session):
    """Return the earliest and latest user->issue interaction times.

    Each relationship's ``time`` property is parsed as
    ``%Y-%m-%dT%H:%M:%SZ`` and tagged as UTC.

    The bounds are a true min/max over the returned rows. Previously they
    were seeded with fixed dates (2024-03-01 as the initial minimum,
    2000-03-01 as the initial maximum), so data lying entirely after or
    before those dates produced a clamped, incorrect bound.

    Args:
        session: Neo4j session (anything exposing ``run(query)``).

    Returns:
        ``(tbegin, tend)`` as timezone-aware datetimes. When the query
        returns no rows the legacy seed values are returned unchanged, i.e.
        ``tbegin`` (2024-03-01) is later than ``tend`` (2000-03-01).
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    tbegin = None
    tend = None

    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
        "where b.state='closed' and c.name='numpy_numpy' and b.number <1000 return a,b,c,r")
    for res in results:
        stamp = datetime.strptime(res.get('r')._properties['time'], time_format).replace(tzinfo=timezone.utc)
        if tend is None or stamp > tend:
            tend = stamp
        if tbegin is None or stamp < tbegin:
            tbegin = stamp

    if tbegin is None:
        # No rows at all: keep the historical return value for compatibility.
        tbegin = datetime.strptime("2024-03-01T15:09:30Z", time_format).replace(tzinfo=timezone.utc)
        tend = datetime.strptime("2000-03-01T15:09:30Z", time_format).replace(tzinfo=timezone.utc)
    return tbegin, tend
def get_c(session, te, tb, cnt):
    """Compute user-user scores for four decay factors and save them.

    Reads ``./id_table1000.txt`` and ``./adj1000.txt``. ``adj[cnt:, 0:cnt]``
    is used as the user-issue adjacency matrix. For every user pair
    ``(i, j)`` with ``i < j`` that shares at least one issue, the interaction
    timestamps of both users on each shared issue are fetched from Neo4j,
    grouped by issue and relationship type, and scored with ``comepute_c``
    for sigma in 0.6, 0.7, 0.8 and 0.9.

    Only the upper triangle is filled. Results are written to
    ``./c_6_1000.txt`` .. ``./c_9_1000.txt``.

    Args:
        session: Neo4j session; ``run(query, parameters)`` is used.
        te: Upper time bound forwarded to ``comepute_c``.
        tb: Lower time bound forwarded to ``comepute_c``.
        cnt: Number of leading id-table indices that are issues.
    """
    def _collect_times(user_id, issue_name, bucket):
        # Values are sent as query parameters rather than spliced into the
        # Cypher text, so quotes in a name cannot break or alter the query.
        results = session.run(
            "MATCH (a:user)-[r]->(b:issue) "
            "where a.id=$user_id and b.name=$issue_name return a,b,r",
            {"user_id": user_id, "issue_name": issue_name})
        for res in results:
            rel = res.get('r')
            bucket[rel.type].append(
                datetime.strptime(rel._properties['time'], "%Y-%m-%dT%H:%M:%SZ").replace(
                    tzinfo=timezone.utc))

    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Dict literal written by get_id_table; parsed without eval().
        id_table = ast.literal_eval(f.read())
    # Inverse mapping: index -> issue name / user id.
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:, 0:cnt]  # user-issue adjacency matrix
    n_users = u_v.shape[0]

    sigmas = (0.6, 0.7, 0.8, 0.9)
    suffixes = ('6', '7', '8', '9')
    matrices = [np.zeros((n_users, n_users)) for _ in sigmas]

    x = 0  # running count of user pairs that share at least one issue
    for i in range(n_users):
        for j in range(i + 1, n_users):
            shared = np.nonzero(u_v[i] * u_v[j])[0]
            if shared.size == 0:
                continue
            x += 1
            print(len(shared), end=' ')

            time_list = dict()
            for k in shared:
                issue_name = id_list[k]
                bucket = collections.defaultdict(list)
                time_list[issue_name] = bucket
                _collect_times(id_list[i + cnt], issue_name, bucket)
                _collect_times(id_list[j + cnt], issue_name, bucket)

            # compute c(u, u) for every sigma
            values = [comepute_c(time_list, sigma, te, tb) for sigma in sigmas]
            for matrix, value in zip(matrices, values):
                matrix[i, j] = value
            print(x, tuple([i, j, values[0]]), end=' ')

    for suffix, matrix in zip(suffixes, matrices):
        np.savetxt('./c_' + suffix + '_1000.txt', matrix, fmt='%f')
def comepute_c(time_list, sigma, te, tb):
    """Score a set of timestamped interactions with exponential weighting.

    For every issue and every relationship type, the timestamps are sorted
    newest first; the timestamp at position ``t`` (0-based, out of ``n``)
    contributes ``sigma ** (n - t - 1) * (time - tb) / (te - tb)``. The
    contributions are averaged per type, then per issue, then over all
    issues.

    Empty input no longer raises ``ZeroDivisionError``: an empty
    ``time_list`` returns 0.0, and an issue without interactions (or a type
    with an empty timestamp list) contributes 0 while still counting in the
    surrounding average.

    Args:
        time_list: ``{issue: {relationship_type: [datetime, ...]}}``.
        sigma: Decay base.
        te: End of the normalisation window; must differ from ``tb``.
        tb: Start of the normalisation window.

    Returns:
        The averaged score as a float.
    """
    if not time_list:
        return 0.0

    span = te - tb
    total = 0
    for by_type in time_list.values():
        if not by_type:
            # No interactions recorded for this issue: contributes 0.
            continue
        issue_score = 0
        for stamps in by_type.values():
            if not stamps:
                continue
            ordered = sorted(stamps, reverse=True)
            n = len(ordered)
            # Operand order is kept as in the original expression so the
            # timedelta arithmetic rounds identically.
            type_score = sum(
                sigma ** (n - pos - 1) * (stamp - tb) / span
                for pos, stamp in enumerate(ordered))
            issue_score += type_score / n
        total += issue_score / len(by_type)
    return total / len(time_list)
def get_r(session, te, tb, cnt):
    """Compute user-issue scores for four decay factors and save them.

    Reads ``./id_table1000.txt`` and ``./adj1000.txt``. ``adj[cnt:, 0:cnt]``
    is used as the user-issue adjacency matrix. For every non-zero cell the
    user's interaction timestamps on that issue are fetched from Neo4j,
    grouped by relationship type, and scored with ``comepute_c`` for sigma in
    0.6, 0.7, 0.8 and 0.9.

    Results (users x issues) are written to ``./r_6_1000.txt`` ..
    ``./r_9_1000.txt``.

    Args:
        session: Neo4j session; ``run(query, parameters)`` is used.
        te: Upper time bound forwarded to ``comepute_c``.
        tb: Lower time bound forwarded to ``comepute_c``.
        cnt: Number of leading id-table indices that are issues.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Dict literal written by get_id_table; parsed without eval().
        id_table = ast.literal_eval(f.read())
    # Inverse mapping: index -> issue name / user id.
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:, 0:cnt]  # user-issue adjacency matrix
    n_users, n_issues = u_v.shape

    sigmas = (0.6, 0.7, 0.8, 0.9)
    suffixes = ('6', '7', '8', '9')
    matrices = [np.zeros((n_users, n_issues)) for _ in sigmas]

    # argwhere yields the non-zero cells in row-major order, the same order
    # a nested i/j loop would visit them; tolist() gives plain Python ints.
    for i, j in np.argwhere(u_v).tolist():
        issue_name = id_list[j]
        bucket = collections.defaultdict(list)
        time_list = {issue_name: bucket}

        # Values are sent as query parameters rather than spliced into the
        # Cypher text, so quotes in a name cannot break or alter the query.
        results = session.run(
            "MATCH (a:user)-[r]->(b:issue) "
            "where a.id=$user_id and b.name=$issue_name return a,b,r",
            {"user_id": id_list[i + cnt], "issue_name": issue_name})
        for res in results:
            rel = res.get('r')
            bucket[rel.type].append(
                datetime.strptime(rel._properties['time'], "%Y-%m-%dT%H:%M:%SZ").replace(
                    tzinfo=timezone.utc))

        # compute r(u, v) for every sigma
        values = [comepute_c(time_list, sigma, te, tb) for sigma in sigmas]
        for matrix, value in zip(matrices, values):
            matrix[i, j] = value
        print(tuple([i, j, values[0]]), end=' ')

    for suffix, matrix in zip(suffixes, matrices):
        np.savetxt('./r_' + suffix + '_1000.txt', matrix, fmt='%f')
def get_r_old(session, u_v, sigma, te, tb):  # u rows, v columns
    """Legacy user-issue scoring; writes the matrix to ``./r.txt``.

    Every non-integer key of ``./id_table.txt`` is treated as an issue name.
    For each issue, all user relationships are fetched and grouped by user
    and relationship type. Timestamps are sorted newest first and weighted
    with ``sigma ** (n - position - 1) * (time - tb) / (te - tb)``, averaged
    per type and then over the types.

    Fixes relative to the previous version:

    * the weight exponent used the timestamp itself instead of its position,
      which raised ``TypeError``;
    * the issue name is passed as a query parameter instead of being
      concatenated unquoted into the Cypher text;
    * the extra transposed write ``r[issue][user]`` was removed - ``r`` is
      users x issues, so that index pair addressed the wrong cell or fell
      outside the matrix.

    Args:
        session: Neo4j session; ``run(query, parameters)`` is used.
        u_v: User-by-issue matrix; only its dimensions are used.
        sigma: Decay base.
        te: End of the normalisation window.
        tb: Start of the normalisation window.
    """
    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # Dict literal; parsed without eval().
        id_table = ast.literal_eval(f.read())

    n_users = len(u_v)
    n_issues = len(u_v[0])
    r = [[0 for _ in range(n_issues)] for _ in range(n_users)]

    # Integer keys are user ids; everything else is an issue name.
    issues = [key for key in id_table if not isinstance(key, int)]
    for issue in issues:
        results = session.run(
            "MATCH (a:user)-[r]->(b:issue|pr)-[:belongsto]-(c:repository) "
            "where c.name='numpy_numpy' and b.name=$issue_name return a,b,r",
            {"issue_name": issue})

        # user id -> relationship type -> [timestamps]
        times_by_user = collections.defaultdict(lambda: collections.defaultdict(list))
        for res in results:
            rel = res.get('r')
            user_id = res.get('a')._properties['id']
            times_by_user[user_id][rel.type].append(
                datetime.strptime(rel._properties['time'], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc))

        for user_id, by_type in times_by_user.items():
            value = 0
            for stamps in by_type.values():
                ordered = sorted(stamps, reverse=True)
                n = len(ordered)
                type_score = sum(
                    sigma ** (n - pos - 1) * (stamp - tb) / (te - tb)
                    for pos, stamp in enumerate(ordered))
                value += type_score / n
            value /= len(by_type)
            # User indices follow the issue indices in the id table.
            r[id_table[user_id] - n_issues][id_table[issue]] = value

    with open('./r.txt', 'w', encoding='utf-8') as f:
        f.write(str(r))
def get_s1(cnt):
    """Save pairwise dot products of the issue rows to ``./s1_1000.txt``.

    ``adj[0:cnt, cnt:]`` is taken from ``./adj1000.txt``; entry ``(i, j)`` of
    the output is the dot product of rows ``i`` and ``j`` of that slice
    (for a 0/1 matrix: the number of columns set in both rows).

    The element-by-element Python loop was replaced by a single matrix
    product, which produces the same values; the per-row progress print is
    no longer emitted.

    Args:
        cnt: Number of leading rows/columns of the adjacency matrix that
            form the first node group.
    """
    adj = np.loadtxt('./adj1000.txt', dtype=int)
    v_u = adj[0:cnt, cnt:]
    s1 = v_u @ v_u.T
    np.savetxt('./s1_1000.txt', s1, fmt='%f')
def get_s(session, v_v):
    """Compute two issue-issue similarity matrices; write ``s1.txt``/``s2.txt``.

    ``s1[i][j]`` is the Jaccard index of the sets of user ids connected to
    issues ``i`` and ``j``; ``s2[i][j]`` is ``sim(name_i, name_j)`` from
    ``doc2vec_model``. Diagonal entries stay 0. Indices come from
    ``./id_table.txt``.

    Fixes relative to the previous version:

    * the per-issue set collected the issue's own id (node ``b``) instead of
      the user's id (node ``a``), so every Jaccard value was 0;
    * ``s2`` was overwritten by a scalar on each iteration instead of being
      filled cell by cell;
    * ``sim`` was undefined because its module-level import is commented
      out; it is now imported inside the function.

    Args:
        session: Neo4j session (anything exposing ``run(query)``).
        v_v: Issue-by-issue matrix; only its length is used for sizing.
    """
    # Imported here because the top-of-file import is disabled; this keeps
    # the dependency out of the way of callers that never use get_s.
    from doc2vec_model import sim

    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # Dict literal; parsed without eval().
        id_table = ast.literal_eval(f.read())

    size = len(v_v)
    s1 = [[0 for _ in range(size)] for _ in range(size)]
    s2 = [[0 for _ in range(size)] for _ in range(size)]

    # issue name -> set of user ids connected to it
    users_by_issue = collections.defaultdict(set)
    results = session.run(
        "MATCH (a:user)-[]->(b:issue|pr)-[:belongsto]-(c:repository) where c.name='numpy_numpy' and b.state='closed' return a,b")
    for res in results:
        issue = res.get('b')._properties['name']
        users_by_issue[issue].add(res.get('a')._properties['id'])

    for k1, users1 in users_by_issue.items():
        for k2, users2 in users_by_issue.items():
            if k1 == k2:
                continue
            row, col = id_table[k1], id_table[k2]
            s1[row][col] = len(users1.intersection(users2)) / len(users1.union(users2))
            s2[row][col] = sim(k1, k2)

    with open('./s1.txt', 'w', encoding='utf-8') as f:
        f.write(str(s1))
    with open('./s2.txt', 'w', encoding='utf-8') as f:
        f.write(str(s2))
def get_num():
    """Write the row sums of ``./adj1000.txt`` to ``./num1000.txt``.

    One integer per line; for a 0/1 adjacency matrix this is the number of
    neighbours of each node.
    """
    adjacency = np.loadtxt('./adj1000.txt', dtype=int)
    row_totals = adjacency.sum(axis=1)
    np.savetxt('./num1000.txt', row_totals, fmt='%d')
#
# with open('./num.txt', 'w', encoding='utf-8') as f:
# f.write(str(list(num)))
def reprocess(filename):
    """Fill zero cells of a square matrix from their transposed position.

    The matrix is loaded from ``filename``; every cell equal to 0 is replaced
    by the value at the mirrored position ``[j][i]``, non-zero cells are left
    untouched. The result is written to ``'./data/' + filename``.

    The output directory is created when missing (previously a missing
    ``./data`` directory made the save fail), and the element loop is
    replaced by an equivalent vectorised selection.

    Args:
        filename: Path of the text matrix to read; also used as the relative
            output path below ``./data/``.
    """
    data = np.loadtxt(filename, dtype=float)
    # Take the mirrored value only where the cell itself is zero.
    filled = np.where(data == 0, data.T, data)

    out_path = './data/' + filename
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    np.savetxt(out_path, filled, fmt='%f')
def get_index_pairs(cnt):
    """Write aligned (row, 1-column) / (row, 0-column) index pairs.

    Works on ``adj[0:cnt, cnt:]`` from ``./adj1000.txt``. For every row, each
    column holding 1 is combined with each column holding 0; the pair
    ``[row, one_col + cnt]`` goes to ``./one_index_1000.txt`` and the pair
    ``[row, zero_col + cnt]`` to the same line of ``./zero_index_1000.txt``.
    """
    block = np.loadtxt('./adj1000.txt', dtype=int)[0:cnt, cnt:]

    positives = []
    negatives = []
    for row_idx, row_values in enumerate(block):
        linked_cols = np.where(row_values == 1)[0]    # positions holding 1
        unlinked_cols = np.where(row_values == 0)[0]  # positions holding 0
        for linked in linked_cols:
            # One positive entry per negative so both files stay aligned.
            positives.extend([row_idx, linked + cnt] for _ in unlinked_cols)
            negatives.extend([row_idx, unlinked + cnt] for unlinked in unlinked_cols)

    np.savetxt('./one_index_1000.txt', np.array(positives), fmt='%d')
    np.savetxt('./zero_index_1000.txt', np.array(negatives), fmt='%d')
if __name__ == "__main__":
    # Value passed as `cnt` to every pipeline step for the current data set.
    issue_count = 574

    # Complete preprocessing pipeline, in execution order. Only the final
    # step is enabled; re-enable earlier steps to regenerate their outputs.
    #
    # driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "12345678"))
    # session = driver.session()
    #
    # get_id_table(session)
    # get_adj(session)
    # tbegin, tend = get_t(session)
    # get_num()
    #
    # Time bounds recorded earlier for the 3000- and 10000-issue subsets:
    # tbegin = datetime.strptime("2012-06-05T04:40:41Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # tend = datetime.strptime("2024-02-15T21:17:43Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    #
    # Both steps below evaluate sigma = 0.6, 0.7, 0.8 and 0.9.
    # get_c(session, tend, tbegin, issue_count)
    # get_r(session, tend, tbegin, issue_count)
    #
    # get_labels(session, issue_count)
    #
    # driver.close()
    #
    # get_s1(issue_count)
    #
    # reprocess('c_6_1000.txt')
    # reprocess('c_7_1000.txt')
    # reprocess('c_8_1000.txt')
    # reprocess('c_9_1000.txt')
    get_index_pairs(issue_count)