tld_research/tld/code/preprocess/extract_jira_data.py
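"""Extract issue and link records from a MongoDB dump of public Jira
repositories (database ``JiraRepos``) and write one JSON file per repo
under ``../../data/raw/issues`` and ``../../data/raw/links``."""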

import sys
sys.path.append("..")  # add the directory containing util to the import path
import time
import json
import logging
import argparse
from pathlib import Path
from pymongo import MongoClient
from util import logging_init
DB = "JiraRepos"
REPOS = [
    "Apache",
    "Hyperledger",
    "IntelDAOS",
    "JFrog",
    "Jira",
    "JiraEcosystem",
    "MariaDB",
    "Mindville",
    "Mojang",
    "MongoDB",
    "Qt",
    "RedHat",
    "Sakai",
    "SecondLife",
    "Sonatype",
    "Spring",
]
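# The Epic link is stored in a per-Jira-instance custom field; this maps each
# repo (collection) to the custom field id that holds the parent epic key.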
EPIC_FIELD = {
    "Apache": "customfield_12311120",
    "Hyperledger": "customfield_10006",
    "IntelDAOS": "customfield_10092",
    "JFrog": "customfield_10806",
    "Jira": "customfield_12931",
    "JiraEcosystem": "customfield_12180",
    "MariaDB": "customfield_10600",
    "Mindville": "customfield_10000",
    "MongoDB": "customfield_10857",
    "Qt": "customfield_10400",
    "RedHat": "customfield_12311140",
    "Sakai": "customfield_10772",
    "SecondLife": "customfield_10871",
    "Sonatype": "customfield_11500",
    "Spring": "customfield_10680",
}
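# Output locations for the extracted JSON files (created if they don't exist).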
ISSUE_DIR = Path("../../data/raw/issues")
ISSUE_DIR.mkdir(parents=True, exist_ok=True)
LINK_DIR = Path("../../data/raw/links")
LINK_DIR.mkdir(parents=True, exist_ok=True)


def extract_issues(db, repo: str):
    """Extract Jira issues from one repo collection and dump them to JSON."""
    collection = db[repo]
    cursor = collection.find({})  # fetch every document
    issues = []
    issue_ct, comment_ct = 0, 0  # running totals of issues and comments
    for document in cursor:
        try:
            issue_key = document["key"]
            try:
                issuetype = document["fields"]["issuetype"]["name"]
            except Exception:
                issuetype = "None"
            try:
                status = document["fields"]["status"]["name"]  # workflow status
            except Exception:
                status = "None"
            try:
                priority = document["fields"]["priority"]["name"]  # priority
            except Exception:
                priority = "None"
            project = document["fields"]["project"]["name"]
            try:
                resolution = document["fields"]["resolution"]["name"]  # resolved or not
            except Exception:
                resolution = "Open"
            try:
                component_arr = document["fields"]["components"]
                components = []
                for item in component_arr:
                    components.append(item["name"])
            except Exception:
                components = []
            try:
                created = document["fields"]["created"]
            except Exception:
                created = "None"
            try:
                updated = document["fields"]["updated"]
            except Exception:
                updated = "None"
            try:
                summary = document["fields"]["summary"]
            except Exception:
                summary = " "
            try:
                description = document["fields"]["description"]
            except Exception:
                description = " "
            try:
                comment_arr = document["fields"]["comments"]
                comments = []
                for item in comment_arr:
                    comments.append(item["body"])
                    comment_ct += 1
            except Exception:
                comments = []
            issue = {
                "issue_key": issue_key,
                "type": issuetype,
                "status": status,
                "priority": priority,
                "resolution": resolution,
                "project": project,
                "created": created,
                "updated": updated,
                "title": summary,
                "description": description,
                "comments": comments,
                "components": components,
            }
            issues.append(issue)
            issue_ct += 1
        except Exception:
            pass
    file_name = ISSUE_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(issues, f, ensure_ascii=False, indent=2)
    logging.info(
        f"Extracted {issue_ct} issues, {comment_ct} comments from {repo} repo."
    )


def extract_links(db, repo: str):
    """Extract Jira issue links from one repo collection and dump them to JSON."""
    collection = db[repo]
    cursor = collection.find({})  # fetch every document
    links = []
    for document in cursor:
        try:
            issue_key = document["key"]
            issuelinks = document["fields"]["issuelinks"]
            # save generic issue links
            for issue_link in issuelinks:
                link_type = issue_link["type"]["name"]
                try:
                    in_issue_key = issue_key
                    out_issue_key = issue_link["outwardIssue"]["key"]
                except Exception:
                    out_issue_key = issue_key
                    in_issue_key = issue_link["inwardIssue"]["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Subtask links separately
            subtasks = document["fields"]["subtasks"]
            for subtask in subtasks:
                link_type = "Subtask"
                in_issue_key = issue_key
                out_issue_key = subtask["key"]
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            # handle Epic links separately (per-repo custom field)
            try:
                epic = document["fields"][EPIC_FIELD[repo]]
                link_type = "Epic-Relation"
                in_issue_key = issue_key
                out_issue_key = epic
                link_key = in_issue_key + "_" + out_issue_key
                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }
                links.append(link)
            except Exception:
                pass
            # handle the extra link types specific to the RedHat repo
            if repo == "RedHat":
                try:
                    parent = document["fields"]["customfield_12313140"]
                    link_type = "Parent-Relation"
                    in_issue_key = issue_key
                    out_issue_key = parent
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
                try:
                    feature = document["fields"]["customfield_12318341"]
                    link_type = "Feature-Relation"
                    in_issue_key = issue_key
                    out_issue_key = feature
                    link_key = in_issue_key + "_" + out_issue_key
                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }
                    links.append(link)
                except Exception:
                    pass
        except Exception:
            pass
    file_name = LINK_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(links, f, ensure_ascii=False, indent=2)
    logging.info(f"Extracted {len(links)} links from {repo} repo.")


if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser(description="Extract Jira issues data")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=27017)
    parser.add_argument("--username", default=None)
    parser.add_argument("--password", default=None)
    args = parser.parse_args()
    logging_init(log_filename="extract_jira_data", log_dir="../../log/preprocess")
    start_time = time.perf_counter()
    # open a connection to MongoDB
    with MongoClient(
        host=args.host,
        port=args.port,
        username=args.username,
        password=args.password,
        serverSelectionTimeoutMS=5000,
    ) as client:
        db = client[DB]
        for repo in REPOS:
            extract_issues(db, repo)
            extract_links(db, repo)
            logging.info("=" * 20)
    end_time = time.perf_counter()
    logging.info(f"Time cost: {end_time - start_time:.3f}s")