# github_mongoDB_dataprocess/prepare_for_gat.py


from neo4j import GraphDatabase
import numpy as np
from datetime import datetime, timezone
import collections
# from doc2vec_model import sim
import ast

def get_labels(session,cnt):
    """Build the issue-by-user label matrix and save it to ./labels.txt.

    A cell is 1 when the query returns a relationship from that user to that
    issue (closed numpy_numpy issues with number < 1000), otherwise 0.

    Args:
        session: Open neo4j session used to run the Cypher query.
        cnt: Number of issue rows. It is also the offset subtracted from a
            user's table index to obtain its column, because get_id_table
            indexes issues first and users after them.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # The file holds a dict literal written with str(); literal_eval
        # parses it without executing arbitrary code the way eval would.
        id_table = ast.literal_eval(f.read())

    # Size the matrix from the table instead of hard-coding one dataset's
    # dimensions: cnt issue rows, and one column per remaining (user) entry.
    labels = np.zeros((cnt, len(id_table) - cnt), dtype=int)

    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return a,b")
    for res in results:
        user = res['a']['id']
        issue = res['b']['name']
        labels[id_table[issue], id_table[user] - cnt] = 1

    np.savetxt('./labels.txt', labels, fmt='%d')

def get_id_table(session):
    """Assign consecutive indices to issues, then users, and save the table.

    Issue names (closed numpy_numpy issues with number < 1000) receive
    indices starting at 0; the ids of users related to those issues receive
    the indices that follow. The resulting dict is written as a Python
    literal to ./id_table1000.txt. The running count is printed after each
    of the two phases.

    Args:
        session: Open neo4j session used to run the Cypher queries.
    """
    id_table = dict()

    # Author's recorded result sizes for other thresholds (from the original
    # note): <10000 -> 5071 closed numpy issues, <3000 -> 2342, <500 -> 574.
    results = session.run("MATCH (b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return b")
    for res in results:
        # setdefault keeps the first index for a repeated key, so the index
        # space stays dense (0..len-1) even if a row is returned twice.
        id_table.setdefault(res['b']['name'], len(id_table))
    print(len(id_table))

    # Author's recorded user counts: <10000 -> 3181, <3000 -> 671, <500 -> 196.
    results = session.run("MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) where c.name='numpy_numpy' and b.state='closed' and b.number < 1000 return distinct a")
    for res in results:
        id_table.setdefault(res['a']['id'], len(id_table))
    print(len(id_table))

    with open('./id_table1000.txt', 'w', encoding='utf-8') as f:
        f.write(str(id_table))

def get_adj(session):
    """Build the symmetric 0/1 adjacency matrix and save it to ./adj1000.txt.

    Nodes are every key of the id table read from ./id_table1000.txt. An
    edge is set (in both directions) for each issue->issue relationship and
    each user->issue relationship returned by the queries below.

    Args:
        session: Open neo4j session used to run the Cypher queries.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Parse the stored dict literal without executing arbitrary code.
        id_table = ast.literal_eval(f.read())

    size = len(id_table)
    adj = np.zeros((size, size), dtype=int)

    # Issue-issue edges. The source issue is filtered on `number`, matching
    # the target side of this query and every other query in this file
    # (the filter previously read `b.id<1000`).
    results = session.run(
        "MATCH (a:repository)<-[:belongsto]-(b:issue)-[]->(c:issue)-[:belongsto]->(d:repository) "
        "where a.name='numpy_numpy' and b.state='closed' and b.number<1000 "
        "and d.name='numpy_numpy' and c.state='closed' and c.number <1000 return b,c")
    for res in results:
        fr = id_table[res['b']['name']]
        to = id_table[res['c']['name']]
        adj[fr, to] = 1
        adj[to, fr] = 1

    # User-issue edges.
    results = session.run("MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
                          "where c.name='numpy_numpy' and b.state='closed' and b.number <1000 return a,b")
    for res in results:
        fr = id_table[res['a']['id']]
        to = id_table[res['b']['name']]
        adj[fr, to] = 1
        adj[to, fr] = 1

    np.savetxt('./adj1000.txt', adj, fmt='%d')
    print('adj finished')

def get_t(session):
    """Return the earliest and latest user->issue relationship timestamps.

    Timestamps are read from the `time` property of each relationship
    between a user and a closed numpy_numpy issue with number < 1000, parsed
    as "%Y-%m-%dT%H:%M:%SZ" and marked as UTC.

    Args:
        session: Open neo4j session used to run the Cypher query.

    Returns:
        A (tbegin, tend) tuple of timezone-aware datetimes. When the query
        returns no rows, the historical fallback values
        (2024-03-01T15:09:30Z, 2000-03-01T15:09:30Z) are returned.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    results = session.run(
        "MATCH (a:user)-[r]->(b:issue)-[:belongsto]->(c:repository) "
        "where b.state='closed' and c.name='numpy_numpy' and b.number <1000 return a,b,c,r")

    # Track the true min/max instead of comparing against fixed sentinel
    # dates, which would give a wrong range for data outside 2000..2024.
    tbegin = None
    tend = None
    for res in results:
        stamp = datetime.strptime(res['r']['time'], time_format).replace(tzinfo=timezone.utc)
        if tbegin is None or stamp < tbegin:
            tbegin = stamp
        if tend is None or stamp > tend:
            tend = stamp

    if tbegin is None:
        # No rows at all: keep the values the function used to return.
        tbegin = datetime(2024, 3, 1, 15, 9, 30, tzinfo=timezone.utc)
        tend = datetime(2000, 3, 1, 15, 9, 30, tzinfo=timezone.utc)
    return tbegin, tend

def get_c(session,te,tb,cnt):
    """Compute pairwise user scores c(u, u') for four sigma values and save them.

    For every pair of users (i < j) that share at least one issue in the
    user-issue block of ./adj1000.txt, the timestamps of both users'
    relationships to each shared issue are grouped by issue and relationship
    type and scored with comepute_c for sigma = 0.6, 0.7, 0.8 and 0.9. Only
    the upper triangle (j > i) is filled. Results go to ./c_6_1000.txt,
    ./c_7_1000.txt, ./c_8_1000.txt and ./c_9_1000.txt. Progress is printed.

    Args:
        session: Open neo4j session used to run the Cypher queries.
        te: End of the time range (timezone-aware datetime) passed to comepute_c.
        tb: Start of the time range (timezone-aware datetime) passed to comepute_c.
        cnt: Index of the first user row/column in the adjacency matrix;
            indices below it are issues.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Parse the stored dict literal without executing arbitrary code.
        id_table = ast.literal_eval(f.read())

    # Invert the table: matrix index -> original key (issue name / user id).
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:,0:cnt]  # user-issue adjacency block
    n_users = u_v.shape[0]

    # One (sigma, output path, matrix) entry per decay value.
    outputs = [
        (0.6, './c_6_1000.txt', np.zeros((n_users, n_users))),
        (0.7, './c_7_1000.txt', np.zeros((n_users, n_users))),
        (0.8, './c_8_1000.txt', np.zeros((n_users, n_users))),
        (0.9, './c_9_1000.txt', np.zeros((n_users, n_users))),
    ]

    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # Parameterized so values are never spliced into the query text.
    query = ("MATCH (a:user)-[r]->(b:issue) "
             "where a.id=$user_id and b.name=$issue_name return a,b,r")
    events_cache = {}

    def fetch_events(user_pos, issue_pos):
        """Return [(relationship type, timestamp), ...] for one user/issue."""
        key = (user_pos, issue_pos)
        if key not in events_cache:
            events = []
            results = session.run(
                query,
                user_id=id_list[user_pos + cnt],
                # str() keeps the string comparison the quoted literal had.
                issue_name=str(id_list[issue_pos]))
            for res in results:
                rel = res['r']
                stamp = datetime.strptime(rel['time'], time_format).replace(tzinfo=timezone.utc)
                events.append((rel.type, stamp))
            # The same user/issue pair recurs for every partner user, so
            # remember the answer instead of querying the database again.
            events_cache[key] = events
        return events_cache[key]

    x = 0  # number of user pairs scored so far (progress output only)
    for i in range(n_users):
        for j in range(i+1, n_users):
            pos = list(*np.nonzero(u_v[i]*u_v[j]))  # issues shared by i and j
            if not pos:
                continue
            x += 1
            print(len(pos),end=' ')
            time_list = dict()
            for k in pos:
                k = int(k)
                per_type = collections.defaultdict(list)
                for user_pos in (i, j):
                    for rel_type, stamp in fetch_events(user_pos, k):
                        per_type[rel_type].append(stamp)
                time_list[id_list[k]] = per_type
            # Compute c(u, u') once per sigma.
            values = [comepute_c(time_list, sigma, te, tb) for sigma, _, _ in outputs]
            for (_, _, matrix), value in zip(outputs, values):
                matrix[i, j] = value
            print(x, (i, j, values[0]), end=' ')

    for _, path, matrix in outputs:
        np.savetxt(path, matrix, fmt='%f')
def comepute_c(time_list,sigma,te,tb):
    """Score a nested {object: {action: [timestamps]}} mapping.

    For each action's timestamps, sorted newest first, the term at position
    t of n is sigma ** (n - t - 1) * (timestamp - tb) / (te - tb). Terms are
    averaged per action, action scores are averaged per object, and object
    scores are averaged over the whole mapping.

    Args:
        time_list: Mapping of object -> mapping of action -> list of datetimes.
        sigma: Base of the power applied per position.
        te: End of the time range.
        tb: Start of the time range.

    Returns:
        The averaged score.

    Raises:
        ZeroDivisionError: If the mapping, any per-object mapping, or any
            timestamp list is empty, or if te equals tb.
    """
    total = 0
    for obj in time_list:
        per_action = time_list[obj]
        obj_score = 0
        for action in per_action:
            ordered = sorted(per_action[action], reverse=True)
            count = len(ordered)
            action_score = 0
            for rank, stamp in enumerate(ordered):
                # Operand order is kept as-is: float * timedelta is rounded
                # to microseconds before the division.
                action_score += sigma ** (count - rank - 1) * (stamp - tb) / (te - tb)
            obj_score += action_score / count
        total += obj_score / len(per_action)
    total = total / len(time_list)
    return total

def get_r(session,te,tb,cnt):
    """Compute user-issue scores r(u, v) for four sigma values and save them.

    For every non-zero cell of the user-issue block of ./adj1000.txt, the
    timestamps of that user's relationships to that issue are grouped by
    relationship type and scored with comepute_c for sigma = 0.6, 0.7, 0.8
    and 0.9. Results go to ./r_6_1000.txt, ./r_7_1000.txt, ./r_8_1000.txt
    and ./r_9_1000.txt. Progress is printed.

    Args:
        session: Open neo4j session used to run the Cypher queries.
        te: End of the time range (timezone-aware datetime) passed to comepute_c.
        tb: Start of the time range (timezone-aware datetime) passed to comepute_c.
        cnt: Index of the first user row/column in the adjacency matrix;
            indices below it are issues.
    """
    with open('./id_table1000.txt', 'r', encoding='utf-8') as f:
        # Parse the stored dict literal without executing arbitrary code.
        id_table = ast.literal_eval(f.read())

    # Invert the table: matrix index -> original key (issue name / user id).
    id_list = [0] * len(id_table)
    for key, index in id_table.items():
        id_list[index] = key

    adj = np.loadtxt('./adj1000.txt', dtype=int)
    u_v = adj[cnt:,0:cnt]  # user-issue adjacency block
    n_users, n_issues = u_v.shape

    # One (sigma, output path, matrix) entry per decay value.
    outputs = [
        (0.6, './r_6_1000.txt', np.zeros((n_users, n_issues))),
        (0.7, './r_7_1000.txt', np.zeros((n_users, n_issues))),
        (0.8, './r_8_1000.txt', np.zeros((n_users, n_issues))),
        (0.9, './r_9_1000.txt', np.zeros((n_users, n_issues))),
    ]

    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # Parameterized so values are never spliced into the query text.
    query = ("MATCH (a:user)-[r]->(b:issue) "
             "where a.id=$user_id and b.name=$issue_name return a,b,r")

    for i in range(n_users):
        for j in range(n_issues):
            if not u_v[i][j]:
                continue
            per_type = collections.defaultdict(list)
            results = session.run(
                query,
                user_id=id_list[i + cnt],
                # str() keeps the string comparison the quoted literal had.
                issue_name=str(id_list[j]))
            for res in results:
                rel = res['r']
                per_type[rel.type].append(
                    datetime.strptime(rel['time'], time_format).replace(tzinfo=timezone.utc))
            time_list = {id_list[j]: per_type}

            # Compute r(u, v) once per sigma.
            values = [comepute_c(time_list, sigma, te, tb) for sigma, _, _ in outputs]
            for (_, _, matrix), value in zip(outputs, values):
                matrix[i, j] = value
            print((i, j, values[0]), end=' ')

    for _, path, matrix in outputs:
        np.savetxt(path, matrix, fmt='%f')

def get_r_old(session,u_v,sigma,te,tb): # u rows, v columns
    """Older variant of the user-issue score computation; writes ./r.txt.

    For every non-integer key of the table in ./id_table.txt (treated as an
    issue name), the timestamps of each user's relationships to that issue
    are grouped by relationship type and scored per user. The score is
    stored at [user index - number of columns of u_v][issue index].

    Args:
        session: Open neo4j session used to run the Cypher queries.
        u_v: User-by-issue matrix; only its dimensions are used.
        sigma: Base of the power applied per position.
        te: End of the time range (timezone-aware datetime).
        tb: Start of the time range (timezone-aware datetime).
    """
    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # Parse the stored dict literal without executing arbitrary code.
        id_table = ast.literal_eval(f.read())

    n_issues = len(u_v[0])
    r = [[0 for _ in range(n_issues)] for _ in range(len(u_v))]
    # Integer keys are user ids; everything else is an issue name.
    issue_set = {key for key in id_table if not isinstance(key, int)}

    time_format = "%Y-%m-%dT%H:%M:%SZ"
    # Parameterized: the name was previously spliced in without quotes.
    query = ("MATCH (a:user)-[r]->(b:issue|pr)-[:belongsto]-(c:repository) "
             "where c.name='numpy_numpy' and b.name=$issue_name return a,b,r")

    for issue in issue_set:
        results = session.run(query, issue_name=issue)
        # user id -> relationship type -> list of timestamps
        events = collections.defaultdict(lambda: collections.defaultdict(list))
        for res in results:
            rel = res['r']
            # Every timestamp is recorded, not only the first one per type.
            events[res['a']['id']][rel.type].append(
                datetime.strptime(rel['time'], time_format).replace(tzinfo=timezone.utc))

        for user_id, by_type in events.items():
            value = 0
            for stamps in by_type.values():
                times = sorted(stamps, reverse=True)
                value1 = 0
                # The exponent uses the position t, not the timestamp itself.
                for t, stamp in enumerate(times):
                    value1 += sigma**(len(times)-t-1)*(stamp-tb)/(te-tb)
                value += value1/len(times)
            value /= len(by_type)
            # r is user x issue, so only this orientation is a valid cell;
            # the former mirrored write indexed the matrix transposed.
            r[id_table[user_id] - n_issues][id_table[issue]] = value

    with open('./r.txt', 'w', encoding='utf-8') as f:
        f.write(str(r))

def get_s1(cnt):
    """Save the issue-issue shared-neighbour counts to ./s1_1000.txt.

    Takes the block of ./adj1000.txt with rows 0..cnt-1 and columns cnt..
    and stores, for every pair of rows, the dot product of the two rows.

    Args:
        cnt: Index of the first column of the block; also the number of
            rows taken from the top of the adjacency matrix.
    """
    adj = np.loadtxt('./adj1000.txt', dtype=int)
    v_u = adj[0:cnt,cnt:]
    # All pairwise row dot products in one matrix product instead of a
    # Python double loop.
    s1 = v_u @ v_u.T
    np.savetxt('./s1_1000.txt', s1, fmt='%f')

def get_s(session,v_v):
    """Compute two issue-issue similarity matrices; write ./s1.txt and ./s2.txt.

    s1 is the Jaccard similarity of the sets of users related to each pair
    of closed numpy_numpy issues/PRs. s2 holds sim(k1, k2) for the same
    pairs. Diagonal cells stay 0. Both matrices are written as Python list
    literals.

    Args:
        session: Open neo4j session used to run the Cypher query.
        v_v: Issue-by-issue matrix; only its length is used to size the outputs.
    """
    with open('./id_table.txt', 'r', encoding='utf-8') as f:
        # Parse the stored dict literal without executing arbitrary code.
        id_table = ast.literal_eval(f.read())

    size = len(v_v)
    s1 = [[0 for _ in range(size)] for _ in range(size)]
    s2 = [[0 for _ in range(size)] for _ in range(size)]

    # issue name -> set of ids of the users related to it
    participants = collections.defaultdict(set)
    results = session.run(
        "MATCH (a:user)-[]->(b:issue|pr)-[:belongsto]-(c:repository) where c.name='numpy_numpy' and b.state='closed' return a,b")
    for res in results:
        # Collect the user's id; collecting the issue's own id would make
        # every set a singleton and every Jaccard score 0.
        participants[res['b']['name']].add(res['a']['id'])

    for k1 in participants:
        for k2 in participants:
            if k1 == k2:
                continue
            row, col = id_table[k1], id_table[k2]
            shared = participants[k1] & participants[k2]
            combined = participants[k1] | participants[k2]
            s1[row][col] = len(shared)/len(combined)
            # NOTE(review): `sim` is not defined in this file (the
            # `from doc2vec_model import sim` line at the top is commented
            # out); it must be in scope before this function is called.
            # The result goes into the matrix cell rather than replacing
            # the whole matrix.
            s2[row][col] = sim(k1,k2)

    with open('./s1.txt', 'w', encoding='utf-8') as f:
        f.write(str(s1))
    with open('./s2.txt', 'w', encoding='utf-8') as f:
        f.write(str(s2))

def get_num():
    """Write the row sums of ./adj1000.txt to ./num1000.txt, one per line."""
    adjacency = np.loadtxt('./adj1000.txt', dtype=int)
    row_totals = adjacency.sum(axis=1)
    np.savetxt('./num1000.txt', row_totals, fmt='%d')
    # An earlier variant wrote str(list(num)) to ./num.txt instead.

def reprocess(filename):
    """Fill zero cells of a square matrix from the mirrored cell and save it.

    Loads the matrix from `filename`, replaces every cell equal to 0 with
    the value at the transposed position, and writes the result to
    './data/' + filename (creating ./data if needed).

    Args:
        filename: Path of the whitespace-separated matrix file to read; the
            same string is appended to './data/' for the output path.

    Raises:
        ValueError: If the matrix contains zeros but is not square.
    """
    import os  # local import: this file has no module-level `import os`

    data = np.loadtxt(filename, dtype=float, ndmin=2)
    zero_mask = data == 0
    if zero_mask.any():
        if data.shape[0] != data.shape[1]:
            raise ValueError(
                'reprocess expects a square matrix, got shape %s' % (data.shape,))
        # Vectorized mirror fill: zero cells take the transposed value.
        data = np.where(zero_mask, data.T, data)

    os.makedirs('./data', exist_ok=True)
    np.savetxt('./data/'+filename, data, fmt='%f')

def get_index_pairs(cnt):
    """Write paired (row, column) indices of 1-cells and 0-cells per row.

    Works on the block of ./adj1000.txt with rows 0..cnt-1 and columns
    cnt.. . For each row, every column holding 1 is paired with every
    column holding 0: the 1-cell goes to ./one_index_1000.txt and the 0-cell
    to the same line of ./zero_index_1000.txt. Column indices are written
    in full-matrix coordinates (block column + cnt).

    Args:
        cnt: Number of rows taken from the top of the matrix, and the index
            of the first column of the block.
    """
    full_adj = np.loadtxt('./adj1000.txt', dtype=int)
    block = full_adj[0:cnt,cnt:]

    positives = []
    negatives = []
    for row_idx, row in enumerate(block):
        one_cols = np.where(row == 1)[0]   # positions holding 1
        zero_cols = np.where(row == 0)[0]  # positions holding 0
        for one_col in one_cols:
            # Repeat the 1-cell once per 0-cell so both lists stay aligned.
            positives.extend([row_idx, one_col + cnt] for _ in zero_cols)
            negatives.extend([row_idx, zero_col + cnt] for zero_col in zero_cols)

    np.savetxt('./one_index_1000.txt', np.array(positives), fmt='%d')
    np.savetxt('./zero_index_1000.txt', np.array(negatives), fmt='%d')


if __name__ == "__main__":
    # Only the final step is currently enabled. The earlier pipeline steps
    # are kept below, commented out, as a record of the order they ran in.
    #
    # driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "12345678"))
    # session = driver.session()
    #
    # get_id_table(session)
    # get_adj(session)
    # tbegin,tend = get_t(session)
    # get_num()
    #
    # Previously recorded time range for the 3000 configuration:
    # tbegin = datetime.strptime("2012-06-05T04:40:41Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # tend = datetime.strptime("2024-02-15T21:17:43Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # Previously recorded time range for the 10000 configuration:
    # tbegin = datetime.strptime("2012-06-05T04:40:41Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    # tend = datetime.strptime("2024-02-15T21:17:43Z", "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    #
    # Scores for sigma 0.6 0.7 0.8 0.9:
    # get_c(session, tend, tbegin, 574)
    # get_r(session, tend, tbegin,574)
    #
    # get_labels(session,574)
    #
    # driver.close()
    #
    # get_s1(574)
    #
    # reprocess('c_6_1000.txt')
    # reprocess('c_7_1000.txt')
    # reprocess('c_8_1000.txt')
    # reprocess('c_9_1000.txt')

    # Presumably the number of issue nodes, i.e. the index of the first
    # user in the adjacency matrix -- verify against the id table in use.
    first_user_index = 574
    get_index_pairs(first_user_index)