Delete unused files

huaian_zhou 2024-03-23 16:42:35 +08:00
parent 6d8ab9c802
commit 3794e4274e
3 changed files with 0 additions and 362 deletions

View File

@@ -1,317 +0,0 @@
import sys
sys.path.append("..")  # add the directory containing util to the import path
import time
import json
import logging
import argparse
from pathlib import Path
from pymongo import MongoClient
from util import logging_init
DB = "JiraRepos"
REPOS = [
"Apache",
"Hyperledger",
"IntelDAOS",
"JFrog",
"Jira",
"JiraEcosystem",
"MariaDB",
"Mindville",
"Mojang",
"MongoDB",
"Qt",
"RedHat",
"Sakai",
"SecondLife",
"Sonatype",
"Spring",
]
# custom field ID used for epic links in each repo (no entry for Mojang)
EPIC_FIELD = {
    "Apache": "customfield_12311120",
    "Hyperledger": "customfield_10006",
    "IntelDAOS": "customfield_10092",
    "JFrog": "customfield_10806",
    "Jira": "customfield_12931",
    "JiraEcosystem": "customfield_12180",
    "MariaDB": "customfield_10600",
    "Mindville": "customfield_10000",
    "MongoDB": "customfield_10857",
    "Qt": "customfield_10400",
    "RedHat": "customfield_12311140",
    "Sakai": "customfield_10772",
    "SecondLife": "customfield_10871",
    "Sonatype": "customfield_11500",
    "Spring": "customfield_10680",
}
ISSUE_DIR = Path("../../data/raw/issues")
ISSUE_DIR.mkdir(parents=True, exist_ok=True)
LINK_DIR = Path("../../data/raw/links")
LINK_DIR.mkdir(parents=True, exist_ok=True)

def extract_issues(db, repo: str):
    """Extract Jira issue data."""
    collection = db[repo]
    cursor = collection.find({})  # query all documents
    issues = []
    issue_ct, comment_ct = 0, 0  # total number of issues and comments
    for document in cursor:
        try:
            issue_key = document["key"]
            try:
                issuetype = document["fields"]["issuetype"]["name"]
            except Exception:
                issuetype = "None"
            try:
                status = document["fields"]["status"]["name"]  # status
            except Exception:
                status = "None"
            try:
                priority = document["fields"]["priority"]["name"]  # priority
            except Exception:
                priority = "None"
            project = document["fields"]["project"]["name"]
            try:
                resolution = document["fields"]["resolution"]["name"]  # resolved or not
            except Exception:
                resolution = "Open"
            try:
                component_arr = document["fields"]["components"]
                components = []
                for item in component_arr:
                    components.append(item["name"])
            except Exception:
                components = []
            try:
                created = document["fields"]["created"]
            except Exception:
                created = "None"
            try:
                updated = document["fields"]["updated"]
            except Exception:
                updated = "None"
            try:
                summary = document["fields"]["summary"]
            except Exception:
                summary = " "
            try:
                description = document["fields"]["description"]
            except Exception:
                description = " "
            try:
                comment_arr = document["fields"]["comments"]
                comments = []
                for item in comment_arr:
                    comments.append(item["body"])
                    comment_ct += 1
            except Exception:
                comments = []
            issue = {
                "issue_key": issue_key,
                "type": issuetype,
                "status": status,
                "priority": priority,
                "resolution": resolution,
                "project": project,
                "created": created,
                "updated": updated,
                "title": summary,
                "description": description,
                "comments": comments,
                "components": components,
            }
            issues.append(issue)
            issue_ct += 1
        except Exception:
            pass
    file_name = ISSUE_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(issues, f, ensure_ascii=False, indent=2)
    logging.info(
        f"Extracted {issue_ct} issues, {comment_ct} comments from {repo} repo."
    )

def extract_links(db, repo: str):
    """Extract Jira issue link data."""
    collection = db[repo]
    cursor = collection.find({})  # query all documents
    links = []
    for document in cursor:
        try:
            issue_key = document["key"]
            issuelinks = document["fields"]["issuelinks"]
            # save generic link types
            for issue_link in issuelinks:
                type = issue_link["type"]["name"]
                try:
                    in_issue_key = issue_key
                    out_issue_key = issue_link["outwardIssue"]["key"]
                except Exception:
                    out_issue_key = issue_key
                    in_issue_key = issue_link["inwardIssue"]["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Subtask links separately
            subtasks = document["fields"]["subtasks"]
            for subtask in subtasks:
                type = "Subtask"
                in_issue_key = issue_key
                out_issue_key = subtask["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Epic links separately
            try:
                epic = document["fields"][EPIC_FIELD[repo]]
                type = "Epic-Relation"
                in_issue_key = issue_key
                out_issue_key = epic
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            except Exception:
                pass
            # handle the special link types found only in the RedHat repo
            if repo == "RedHat":
                try:
                    parent = document["fields"]["customfield_12313140"]
                    type = "Parent-Relation"
                    in_issue_key = issue_key
                    out_issue_key = parent
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
                try:
                    feature = document["fields"]["customfield_12318341"]
                    type = "Feature-Relation"
                    in_issue_key = issue_key
                    out_issue_key = feature
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
        except Exception:
            pass
    file_name = LINK_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(links, f, ensure_ascii=False, indent=2)
    logging.info(f"Extracted {len(links)} links from {repo} repo.")
if __name__ == "__main__":
# 解析命令行参数
parser = argparse.ArgumentParser(description="Extract Jira issues data")
parser.add_argument("--host", default="localhost")
parser.add_argument("--port", type=int, default=27017)
parser.add_argument("--username", default=None)
parser.add_argument("--password", default=None)
args = parser.parse_args()
logging_init(log_filename="extract_jira_data", log_dir="../../log/preprocess")
start_time = time.perf_counter()
# 创建连接访问MongoDB
with MongoClient(
host=args.host,
port=args.port,
username=args.username,
password=args.password,
serverSelectionTimeoutMS=5000,
) as client:
db = client[DB]
for repo in REPOS:
extract_issues(db, repo)
extract_links(db, repo)
logging.info("=" * 20)
end_time = time.perf_counter()
logging.info(f"Time cost: {end_time - start_time:.3f}s")

View File

@@ -1 +0,0 @@
from .log_helper import logging_init

View File

@@ -1,44 +0,0 @@
import sys
import logging
from pathlib import Path
from datetime import datetime


def logging_init(
    log_filename="monitor", log_level=logging.INFO, log_dir="./log/", only_file=False
):
    """
    Initialize the logging system.

    :param log_filename: log file name
    :param log_level: log level
    :param log_dir: log directory
    :param only_file: whether to write only to the log file (no console output)
    """
    # build the log file path, suffixed with today's date
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    log_filepath = log_dir / (log_filename + "_" + str(datetime.now())[:10] + ".txt")
    # log message format
    format = "[%(asctime)s] - %(levelname)s: %(message)s"
    # write only to the log file
    if only_file:
        logging.basicConfig(
            filename=log_filepath,
            level=log_level,
            format=format,
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    # write to the log file and also to the terminal
    else:
        logging.basicConfig(
            level=log_level,
            format=format,
            datefmt="%Y-%m-%d %H:%M:%S",
            handlers=[
                logging.FileHandler(log_filepath),
                logging.StreamHandler(sys.stdout),
            ],
        )
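
As a usage sketch (not part of the deleted files), logging_init would typically be called once at program start, as the extraction script above does; the log_filename and message below are illustrative only.

import logging
from util import logging_init  # same import used by the extraction script above

logging_init(log_filename="example", log_dir="./log/")
logging.info("logging initialized")  # with only_file=False (the default), written to the dated log file and echoed to stdout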