# github_mongoDB_dataprocess/get_rela_json.py

# Relation record schema written by this module:
#   from: source node id (user id or "<repo>+<issue_number>")
#   to:   target node id
#   attr: optional relation attribute (e.g. label, body)
#   time: event timestamp (ISO-8601 string from GitHub)
import os
import pandas as pd
import json
import collections
import pymongo
import ast
from get_entity_json import get_issues_and_users
def get_fork_json(src_dir='C:/Users/zhou/Desktop/github_mongoDB/fork_issue_number',
                  out_path='./json/relationship/fork.json'):
    """Build fork relations (fork repo -> forked repo) from per-repo Excel sheets.

    Each Excel file in `src_dir` is named '<repo>_fork_repo_issues...' and lists
    candidate fork repositories with an issue count column 'number'; only forks
    with at least one issue are kept.  The paths were hard-coded; they are now
    parameters with the original values as defaults.

    Parameters
    ----------
    src_dir : directory containing the per-repo Excel files.
    out_path : JSON file receiving the list of {"from", "to"} records.
    """
    relations = []
    for file_name in os.listdir(src_dir):
        df = pd.read_excel(os.path.join(src_dir, file_name))
        # Keep only fork repos that actually have issues.
        active_forks = df[df['number'] > 0]['repo_name'].tolist()
        forked_repo = file_name[:file_name.find('_fork_repo_issues')]
        for repo in active_forks:
            relations.append({
                "from": repo.replace('/', '_'),
                "to": forked_repo
            })
    with open(out_path, 'w') as f:
        json.dump(relations, f, indent=4)
def get_belongto_json(size):
    """Export (issue/PR -> repository) "belongto" relations as chunked JSON files.

    Scans three local MongoDB databases — 'numpy_db' (numpy and its forks),
    'reference_db' (upstream/downstream repos listed in
    numpy_refer_repo_name(addNumpyFork).txt) and 'fork_db' (forks of those) —
    for issues created before 2024-03-01, deduplicating issue numbers per repo.
    Writes ./json/relationship/belongtoN.json chunks of `size` records each
    and prints the total record count.  The three formerly copy-pasted loops
    now share one helper.

    Parameters
    ----------
    size : number of records per output JSON chunk.
    """
    cnt = 0
    tmp = []
    # Per-repo set of issue numbers already emitted, to drop duplicates.
    issue_set = collections.defaultdict(set)
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        # Append one "belongto" record per unique pre-2024-03 issue of `name`;
        # flush a numbered chunk file every `size` records.
        nonlocal cnt, tmp
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1, 'url': 1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[name]:
                continue  # duplicate issue/PR number for this repo
            issue_set[name].add(issue['number'])
            tmp.append({
                "from": name + "+" + str(issue['number']),
                "to": name
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/belongto' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('issue&pr') - 1])

    # upstream/downstream repositories of numpy and its forks
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/belongto' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt)
    return None
def get_mentionedin_closed_json(issue_set, pr_set, event_type, actor, rela, size):
    """Export (actor -> issue/PR) relations for one timeline event type.

    Pulls timeline events of `event_type` (created before 2024-03-01, on known
    issue/PR numbers) from 'numpy_db', 'reference_timeline' and
    'fork_reference', and writes ./json/relationship/<rela>N.json chunks.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    event_type : timeline event to select (e.g. 'mentioned', 'closed').
    actor : event field holding the acting user object.
    rela : output file-name prefix.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction (which can fail on incomplete events) is
    guarded now, with narrowed exception types.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': event_type}, {actor: 1, 'issue_number': 1, 'created_at': 1, '_id': 0})
        for event in events:
            # Some events are missing the actor object; skip those records only.
            try:
                record = {
                    "from": event[actor]['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + rela + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + rela + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
def get_opened_json(size):
    """Export (user -> issue/PR) "opened" relations as chunked JSON files.

    Scans 'numpy_db', 'reference_db' and 'fork_db' for issues created before
    2024-03-01, deduplicating issue numbers per repo, and records the opening
    user's id with the creation time.  Writes ./json/relationship/openedN.json
    chunks of `size` records and prints the total count.  The three formerly
    copy-pasted loops now share one helper.

    Parameters
    ----------
    size : number of records per output JSON chunk.
    """
    cnt = 0
    tmp = []
    # Per-repo set of issue numbers already emitted, to drop duplicates.
    issue_set = collections.defaultdict(set)
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        issues = collection.find(
            {"created_at": {"$lt": "2024-03-01T00:00:00Z"}},
            {'number': 1, 'user': 1, 'url': 1, 'created_at': 1, '_id': 0})
        for issue in issues:
            if issue['number'] in issue_set[name]:
                continue  # duplicate issue/PR number for this repo
            issue_set[name].add(issue['number'])
            tmp.append({
                "from": issue['user']['id'],
                "to": name + "+" + str(issue['number']),
                "time": issue['created_at']
            })
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/opened' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if 'issue&pr' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('issue&pr') - 1])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_db']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_db']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/opened' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt)
    return None
def get_commented_json(issue_set, pr_set, size):
    """Export (user -> issue/PR) "commented" relations as chunked JSON files.

    Pulls pre-2024-03 'commented' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference', recording the
    comment body and the author's repository association.  Writes
    ./json/relationship/commentedN.json chunks of `size` records each and
    prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'commented'}, {'user': 1, 'issue_number': 1, 'created_at': 1, 'body': 1, 'author_association': 1, '_id': 0})
        for event in events:
            # Some events are missing user/body fields; skip those records only.
            try:
                record = {
                    "from": event['user']['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at'],
                    "body": event['body'],
                    "author_association": event['author_association']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'commented' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'commented' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'commented')
def get_link_depend_json(issue_set, pr_set, size):
    """Export cross-reference "link" relations and derived repo-level "depend" relations.

    Pulls pre-2024-03 'cross-referenced' timeline events on known issues/PRs
    from 'numpy_db', 'reference_timeline' and 'fork_reference'.  Each event
    yields a link edge (source issue -> referenced issue) when the source repo
    is part of the node set; repo-level depend edges are aggregated from the
    link edges.  Output goes to ./json/relationship/linkN.json and dependN.json.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bugs fixed:
    - For the reference/fork databases the source repo was computed as
      `url.rfind(...).replace(...)` — calling str.replace on the int returned
      by rfind, which always raised and silently dropped every record via the
      broad except.  All branches now use the slicing form the numpy_db branch
      already used, via one helper.
    - Depend chunks were numbered from the current batch length, which resets
      on every flush, so each flush overwrote depend1.json; a running counter
      is used instead, and the final print reports the true total.
    - The chunk flush no longer sits inside the per-event try/except, so write
      errors are not silently swallowed.
    """
    tmp = []
    cnt = 0
    repo_rely = collections.defaultdict(set)
    # Repos that contribute at least one issue or PR node; edges from any
    # other repo are filtered out.
    all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def repo_from_url(url):
        # '.../repos/<owner>/<name>' -> '<owner>_<name>'
        return url[url.rfind('/', 0, url.rfind('/')) + 1:].replace('/', '_')

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'cross-referenced'}, {'source': 1, 'issue_number': 1, 'created_at': 1, 'actor': 1, '_id': 0})
        for event in events:
            # Some events are missing source/actor details; skip those only.
            try:
                source_issue = event['source']['issue']
                repo = repo_from_url(source_issue['repository_url'])
                if repo not in all_repos:
                    continue  # source repo is outside the node set
                record = {
                    "from": repo + "+" + str(source_issue['number']),
                    "to": name + "+" + str(event['issue_number']),
                    "actor": event['actor']['id'],
                    "time": event['created_at']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            repo_rely[repo].add(name)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'link' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial link chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'link' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'link')

    # Repo-level dependency edges derived from the link aggregation above.
    depend = []
    dep_cnt = 0
    for repo in repo_rely.keys():
        for linked_repo in repo_rely[repo]:
            depend.append({
                "from": repo,
                "to": linked_repo
            })
            dep_cnt += 1
            if not dep_cnt % size:
                with open('./json/relationship/' + 'depend' + str(dep_cnt // size) + '.json', 'w') as f:
                    json.dump(depend, f, indent=4)
                depend = []
    if depend:
        with open('./json/relationship/' + 'depend' + str(dep_cnt // size + 1) + '.json', 'w') as f:
            json.dump(depend, f, indent=4)
    print(dep_cnt, 'depend')
def get_labeled_json(issue_set, pr_set, size):
    """Export (user -> issue/PR) "labeled" relations as chunked JSON files.

    Pulls pre-2024-03 'labeled' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference', recording the label
    name applied.  Writes ./json/relationship/labeledN.json chunks of `size`
    records each and prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'labeled'}, {'actor': 1, 'issue_number': 1, 'created_at': 1, 'label': 1, '_id': 0})
        for event in events:
            # Some events are missing actor/label fields; skip those records only.
            try:
                record = {
                    "from": event['actor']['id'],
                    "to": name + "+" + str(event['issue_number']),
                    "time": event['created_at'],
                    "label": event['label']['name']
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'labeled' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'labeled' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'labeled')
def get_assigned_json(issue_set, pr_set, size):
    """Export "assigned" relations (assigner, assignee, issue) as chunked JSON files.

    Pulls pre-2024-03 'assigned' timeline events on known issues/PRs from
    'numpy_db', 'reference_timeline' and 'fork_reference'.  Each record holds
    the assigner's and assignee's user ids, the time, and the issue node id.
    Writes ./json/relationship/assignedN.json chunks of `size` records each
    and prints the total count.

    Parameters
    ----------
    issue_set, pr_set : dict mapping repo name -> set of issue/PR numbers.
    size : records per output chunk.

    Bug fixed: the chunk flush used to sit inside the per-event try/except, so
    an I/O error while writing a chunk was silently swallowed and the batch
    lost; only record construction is guarded now, with narrowed exceptions.
    """
    tmp = []
    cnt = 0
    client = pymongo.MongoClient("mongodb://localhost:27017/")

    def collect(collection, name):
        nonlocal cnt, tmp
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'assigned'}, {'actor': 1, 'issue_number': 1, 'created_at': 1, 'assignee': 1, '_id': 0})
        for event in events:
            # Some events are missing actor/assignee fields; skip those only.
            try:
                record = {
                    "assigner": event['actor']['id'],
                    "assignee": event['assignee']['id'],
                    "time": event['created_at'],
                    "issue": name + "+" + str(event['issue_number'])
                }
            except (KeyError, TypeError):
                continue
            tmp.append(record)
            cnt += 1
            if not cnt % size:
                with open('./json/relationship/' + 'assigned' + str(cnt // size) + '.json', 'w') as f:
                    json.dump(tmp, f, indent=4)
                tmp = []

    # numpy and its forks
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' in collection_name:
            collect(db[collection_name], collection_name[:collection_name.find('_timeline')])

    # upstream/downstream repositories
    with open('./numpy_refer_repo_name(addNumpyFork).txt', 'r', encoding='utf-8', newline='') as f:
        repo_list = set(ast.literal_eval(f.read()))
    db = client['reference_timeline']
    for collection_name in db.list_collection_names():
        if collection_name.replace('_', '/', 1) not in repo_list:
            continue  # repo is not in the upstream/downstream list
        collect(db[collection_name], collection_name)

    # forks of the upstream/downstream repositories
    db = client['fork_reference']
    for collection_name in db.list_collection_names():
        collect(db[collection_name], collection_name)

    # Flush the final partial chunk, if any.
    if tmp:
        with open('./json/relationship/' + 'assigned' + str(cnt // size + 1) + '.json', 'w') as f:
            json.dump(tmp, f, indent=4)
    print(cnt, 'assigned')
def split_file():
    """One-off utility: split commented161.json into two smaller chunk files.

    Records [0, 27000) go to commented400.json; the remainder goes to
    commented401.json.
    """
    with open('./json/relationship/commented161.json', 'r', encoding='utf-8') as src:
        records = json.load(src)
    head, tail = records[0:27000], records[27000:]
    with open('./json/relationship/commented400.json', 'w') as out:
        json.dump(head, out, indent=4)
    with open('./json/relationship/commented401.json', 'w') as out:
        json.dump(tail, out, indent=4)
def test(issue_set, pr_set, size):
    """Debug helper: inspect cross-reference edges originating from
    'bioconda_bioconda-recipes' in numpy_db and count kept events.

    NOTE(review): reconstructed from indentation-stripped source; the nesting
    of the two prints vs. the counter increment is the most plausible reading
    (prints inside the repo check, increment per kept event) — confirm against
    the original file.
    """
    cnt = 0
    repo_rely = collections.defaultdict(set)
    # Repos that contribute at least one issue or PR node.
    all_repos = {i for i in issue_set.keys() if issue_set[i]}.union({i for i in pr_set.keys() if pr_set[i]})
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client['numpy_db']
    for collection_name in db.list_collection_names():
        if '_timeline' not in collection_name:
            continue
        name = collection_name[:collection_name.find('_timeline')]
        collection = db[collection_name]
        # Cross-reference events on known issues/PRs created before 2024-03.
        events = collection.find({"issue_number": {"$in": list(issue_set[name]) + list(pr_set[name])},
                                  "created_at": {"$lt": "2024-03-01T00:00:00Z"},
                                  'event': 'cross-referenced'}, {'source': 1, 'issue_number': 1, 'created_at': 1, 'actor': 1, '_id': 0})
        for event in events:
            # Some events are missing source/actor details; best-effort skip.
            try:
                url = event['source']['issue']['repository_url']
                repo = url[url.rfind('/', 0, url.rfind('/')) + 1:].replace('/', '_')
                if repo not in all_repos:
                    continue
                repo_rely[repo].add(name)
                if repo == 'bioconda_bioconda-recipes':
                    print({
                        "from": repo + "+" + str(event['source']['issue']['number']),
                        "to": name + "+" + str(event['issue_number']),
                        "actor": event['actor']['id'],
                        "time": event['created_at']
                    })
                    print(cnt)
                cnt += 1
            except Exception as e:
                pass
    print(cnt)
if __name__ == "__main__":
    # Chunk size for most relation exports (records per JSON file).
    size = 200000
    # get_fork_json()
    # get_belongto_json(size)
    issue_set, pr_set, user_set = get_issues_and_users()
    # get_mentionedin_closed_json(issue_set, pr_set, 'mentioned', 'actor', 'mentionedin', size)
    # get_mentionedin_closed_json(issue_set, pr_set, 'closed', 'actor', 'closed', size)
    # get_opened_json(size)
    # get_commented_json(issue_set, pr_set, 50000)
    # get_link_depend_json(issue_set, pr_set, size)
    # get_labeled_json(issue_set, pr_set, size)
    # get_assigned_json(issue_set, pr_set, size)
    # split_file()
    test(issue_set, pr_set, size)