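"""Extract issue and issue-link data from the Jira repos stored in MongoDB.

Reads every document from each repo collection listed in REPOS (database
"JiraRepos") and writes one JSON file per repo under ../../data/raw/issues
(issues with their comments) and ../../data/raw/links (issue links, including
subtask, epic, and RedHat-specific relations).
"""
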
import sys

sys.path.append("..")  # make the directory containing the util module importable

import time
import json
import logging
import argparse
from pathlib import Path
from pymongo import MongoClient
from util import logging_init


DB = "JiraRepos"

REPOS = [
    "Apache",
    "Hyperledger",
    "IntelDAOS",
    "JFrog",
    "Jira",
    "JiraEcosystem",
    "MariaDB",
    "Mindville",
    "Mojang",
    "MongoDB",
    "Qt",
    "RedHat",
    "Sakai",
    "SecondLife",
    "Sonatype",
    "Spring",
]

# Custom field that stores the epic link in each repo; keys must match REPOS.
EPIC_FIELD = {
    "Apache": "customfield_12311120",
    "Hyperledger": "customfield_10006",
    "IntelDAOS": "customfield_10092",
    "JFrog": "customfield_10806",
    "Jira": "customfield_12931",
    "JiraEcosystem": "customfield_12180",
    "MariaDB": "customfield_10600",
    "Mindville": "customfield_10000",
    "MongoDB": "customfield_10857",
    "Qt": "customfield_10400",
    "RedHat": "customfield_12311140",
    "Sakai": "customfield_10772",
    "SecondLife": "customfield_10871",
    "Sonatype": "customfield_11500",
    "Spring": "customfield_10680",
}

ISSUE_DIR = Path("../../data/raw/issues")
ISSUE_DIR.mkdir(parents=True, exist_ok=True)
LINK_DIR = Path("../../data/raw/links")
LINK_DIR.mkdir(parents=True, exist_ok=True)


def extract_issues(db, repo: str):
    """Extract Jira issue data from one repo collection and dump it to JSON."""

    collection = db[repo]
    cursor = collection.find({})  # fetch every document in the collection

    issues = []
    issue_ct, comment_ct = 0, 0  # running totals of issues and comments

    for document in cursor:
        try:
            issue_key = document["key"]

            try:
                issuetype = document["fields"]["issuetype"]["name"]
            except Exception:
                issuetype = "None"

            try:
                status = document["fields"]["status"]["name"]
            except Exception:
                status = "None"

            try:
                priority = document["fields"]["priority"]["name"]
            except Exception:
                priority = "None"

            project = document["fields"]["project"]["name"]

            try:
                resolution = document["fields"]["resolution"]["name"]
            except Exception:
                resolution = "Open"  # a missing resolution means the issue is still open

            try:
                component_arr = document["fields"]["components"]
                components = []
                for item in component_arr:
                    components.append(item["name"])
            except Exception:
                components = []

            try:
                created = document["fields"]["created"]
            except Exception:
                created = "None"

            try:
                updated = document["fields"]["updated"]
            except Exception:
                updated = "None"

            try:
                summary = document["fields"]["summary"]
            except Exception:
                summary = " "

            try:
                description = document["fields"]["description"]
            except Exception:
                description = " "

            try:
                comment_arr = document["fields"]["comments"]
                comments = []
                for item in comment_arr:
                    comments.append(item["body"])
                    comment_ct += 1
            except Exception:
                comments = []

            issue = {
                "issue_key": issue_key,
                "type": issuetype,
                "status": status,
                "priority": priority,
                "resolution": resolution,
                "project": project,
                "created": created,
                "updated": updated,
                "title": summary,
                "description": description,
                "comments": comments,
                "components": components,
            }

            issues.append(issue)
            issue_ct += 1

        except Exception:
            pass

    file_name = ISSUE_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(issues, f, ensure_ascii=False, indent=2)

    logging.info(
        f"Extracted {issue_ct} issues, {comment_ct} comments from {repo} repo."
    )


def extract_links(db, repo: str):
    """Extract Jira issue link data from one repo collection and dump it to JSON."""

    collection = db[repo]
    cursor = collection.find({})  # fetch every document in the collection

    links = []

    for document in cursor:
        try:
            issue_key = document["key"]

            issuelinks = document["fields"]["issuelinks"]

            # generic issue links
            for issue_link in issuelinks:
                link_type = issue_link["type"]["name"]

                try:
                    in_issue_key = issue_key
                    out_issue_key = issue_link["outwardIssue"]["key"]
                except Exception:
                    out_issue_key = issue_key
                    in_issue_key = issue_link["inwardIssue"]["key"]

                link_key = in_issue_key + "_" + out_issue_key

                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }

                links.append(link)

            # subtask links are stored in a separate field
            subtasks = document["fields"]["subtasks"]
            for subtask in subtasks:
                link_type = "Subtask"
                in_issue_key = issue_key
                out_issue_key = subtask["key"]

                link_key = in_issue_key + "_" + out_issue_key

                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }

                links.append(link)

            # epic links live in a repo-specific custom field
            try:
                epic = document["fields"][EPIC_FIELD[repo]]
                link_type = "Epic-Relation"
                in_issue_key = issue_key
                out_issue_key = epic

                link_key = in_issue_key + "_" + out_issue_key

                link = {
                    "link_key": link_key,
                    "type": link_type,
                    "in_issue_key": in_issue_key,
                    "out_issue_key": out_issue_key,
                }

                links.append(link)

            except Exception:
                pass

            # the RedHat repo stores two extra link types in custom fields
            if repo == "RedHat":
                try:
                    parent = document["fields"]["customfield_12313140"]
                    link_type = "Parent-Relation"
                    in_issue_key = issue_key
                    out_issue_key = parent

                    link_key = in_issue_key + "_" + out_issue_key

                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }

                    links.append(link)

                except Exception:
                    pass

                try:
                    feature = document["fields"]["customfield_12318341"]
                    link_type = "Feature-Relation"
                    in_issue_key = issue_key
                    out_issue_key = feature

                    link_key = in_issue_key + "_" + out_issue_key

                    link = {
                        "link_key": link_key,
                        "type": link_type,
                        "in_issue_key": in_issue_key,
                        "out_issue_key": out_issue_key,
                    }

                    links.append(link)

                except Exception:
                    pass

        except Exception:
            pass

    file_name = LINK_DIR / (repo + ".json")
    with open(file_name, "w", errors="surrogatepass", encoding="utf-8") as f:
        json.dump(links, f, ensure_ascii=False, indent=2)

    logging.info(f"Extracted {len(links)} links from {repo} repo.")


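# Example invocation (the script file name below is assumed, not taken from the repo):
#   python extract_jira_data.py --host localhost --port 27017 \
#       --username <user> --password <password>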
if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser(description="Extract Jira issue and link data from MongoDB")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=27017)
    parser.add_argument("--username", default=None)
    parser.add_argument("--password", default=None)
    args = parser.parse_args()

    logging_init(log_filename="extract_jira_data", log_dir="../../log/preprocess")

    start_time = time.perf_counter()
    # connect to MongoDB
    with MongoClient(
        host=args.host,
        port=args.port,
        username=args.username,
        password=args.password,
        serverSelectionTimeoutMS=5000,
    ) as client:
        db = client[DB]

        for repo in REPOS:
            extract_issues(db, repo)
            extract_links(db, repo)
            logging.info("=" * 20)

    end_time = time.perf_counter()

    logging.info(f"Time cost: {end_time - start_time:.3f}s")