tld_research/tld/code/preprocess/extract_jira_data.py
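"""Extract issue and link records from a MongoDB dump of public Jira
repositories (database ``JiraRepos``) and write one JSON file per repo
under ``../../data/raw/issues`` and ``../../data/raw/links``."""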

import sys
sys.path.append("..")  # add the directory containing util to the import path
import time
import json
import logging
import argparse
from pathlib import Path
from pymongo import MongoClient
from util import logging_init
DB = "JiraRepos"
REPOS = [
    "Apache",
    "Hyperledger",
    "IntelDAOS",
    "JFrog",
    "Jira",
    "JiraEcosystem",
    "MariaDB",
    "Mindville",
    "Mojang",
    "MongoDB",
    "Qt",
    "RedHat",
    "Sakai",
    "SecondLife",
    "Sonatype",
    "Spring",
]
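# The Epic link is stored in a per-Jira-instance custom field; this maps each
# repo (collection) to the custom field id that holds the parent epic key.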
EPIC_FIELD = {
    "Apache": "customfield_12311120",
    "Hyperledger": "customfield_10006",
    "IntelDAOS": "customfield_10092",
    "JFrog": "customfield_10806",
    "Jira": "customfield_12931",
    "JiraEcosystem": "customfield_12180",
    "MariaDB": "customfield_10600",
    "Mindville": "customfield_10000",
    "MongoDB": "customfield_10857",
    "Qt": "customfield_10400",
    "RedHat": "customfield_12311140",
    "Sakai": "customfield_10772",
    "SecondLife": "customfield_10871",
    "Sonatype": "customfield_11500",
    "Spring": "customfield_10680",
}
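# Output locations for the extracted JSON files (created if they don't exist).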
ISSUE_DIR = Path("../../data/raw/issues")
ISSUE_DIR.mkdir(parents=True, exist_ok=True)
LINK_DIR = Path("../../data/raw/links")
LINK_DIR.mkdir(parents=True, exist_ok=True)


def extract_issues(db, repo: str):
    """Extract Jira issues from one repo collection and dump them to JSON."""
    collection = db[repo]
    cursor = collection.find({})  # fetch every document
    issues = []
    issue_ct, comment_ct = 0, 0  # running totals of issues and comments
    for document in cursor:
        try:
            issue_key = document["key"]
            try:
                issuetype = document["fields"]["issuetype"]["name"]
            except Exception:
                issuetype = "None"
            try:
                status = document["fields"]["status"]["name"]  # workflow status
            except Exception:
                status = "None"
            try:
                priority = document["fields"]["priority"]["name"]  # priority
            except Exception:
                priority = "None"
            project = document["fields"]["project"]["name"]
            try:
                resolution = document["fields"]["resolution"]["name"]  # resolved or not
            except Exception:
                resolution = "Open"
            try:
                component_arr = document["fields"]["components"]
                components = []
                for item in component_arr:
                    components.append(item["name"])
            except Exception:
                components = []
            try:
                created = document["fields"]["created"]
            except Exception:
                created = "None"
            try:
                updated = document["fields"]["updated"]
            except Exception:
                updated = "None"
            try:
                summary = document["fields"]["summary"]
            except Exception:
                summary = " "
            try:
                description = document["fields"]["description"]
            except Exception:
                description = " "
            try:
                comment_arr = document["fields"]["comments"]
                comments = []
                for item in comment_arr:
                    comments.append(item["body"])
                    comment_ct += 1
            except Exception:
                comments = []
            issue = {
                "issue_key": issue_key,
                "type": issuetype,
                "status": status,
                "priority": priority,
                "resolution": resolution,
                "project": project,
                "created": created,
                "updated": updated,
                "title": summary,
                "description": description,
                "comments": comments,
                "components": components,
            }
            issues.append(issue)
            issue_ct += 1
        except Exception:
            pass
    file_name = ISSUE_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(issues, f, ensure_ascii=False, indent=2)
    logging.info(
        f"Extracted {issue_ct} issues, {comment_ct} comments from {repo} repo."
    )


def extract_links(db, repo: str):
    """Extract Jira issue links from one repo collection and dump them to JSON."""
    collection = db[repo]
    cursor = collection.find({})  # fetch every document
    links = []
    for document in cursor:
        try:
            issue_key = document["key"]
            issuelinks = document["fields"]["issuelinks"]
            # save generic issue links
            for issue_link in issuelinks:
                link_type = issue_link["type"]["name"]
                try:
                    in_issue_key = issue_key
                    out_issue_key = issue_link["outwardIssue"]["key"]
                except Exception:
                    out_issue_key = issue_key
                    in_issue_key = issue_link["inwardIssue"]["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Subtask links separately
            subtasks = document["fields"]["subtasks"]
            for subtask in subtasks:
                link_type = "Subtask"
                in_issue_key = issue_key
                out_issue_key = subtask["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Epic links separately (per-repo custom field)
            try:
                epic = document["fields"][EPIC_FIELD[repo]]
                link_type = "Epic-Relation"
                in_issue_key = issue_key
                out_issue_key = epic
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            except Exception:
                pass
            # handle the extra link types specific to the RedHat repo
            if repo == "RedHat":
                try:
                    parent = document["fields"]["customfield_12313140"]
                    link_type = "Parent-Relation"
                    in_issue_key = issue_key
                    out_issue_key = parent
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
                try:
                    feature = document["fields"]["customfield_12318341"]
                    link_type = "Feature-Relation"
                    in_issue_key = issue_key
                    out_issue_key = feature
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
        except Exception:
            pass
    file_name = LINK_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(links, f, ensure_ascii=False, indent=2)
    logging.info(f"Extracted {len(links)} links from {repo} repo.")


if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser(description="Extract Jira issues data")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=27017)
    parser.add_argument("--username", default=None)
    parser.add_argument("--password", default=None)
    args = parser.parse_args()
    logging_init(log_filename="extract_jira_data", log_dir="../../log/preprocess")
    start_time = time.perf_counter()
    # open a connection to MongoDB
    with MongoClient(
        host=args.host,
        port=args.port,
        username=args.username,
        password=args.password,
        serverSelectionTimeoutMS=5000,
    ) as client:
        db = client[DB]
        for repo in REPOS:
            extract_issues(db, repo)
            extract_links(db, repo)
            logging.info("=" * 20)
    end_time = time.perf_counter()
    logging.info(f"Time cost: {end_time - start_time:.3f}s")