完成实证分析部分实验
This commit is contained in:
parent
cbf41e55be
commit
6d8ab9c802
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,341 @@
|
|||
{
|
||||
"Apache": {
|
||||
"Blocked": {
|
||||
"id": "12310361",
|
||||
"name": "Blocked",
|
||||
"inward": "Blocked",
|
||||
"outward": "Blocked",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310361"
|
||||
},
|
||||
"Blocker": {
|
||||
"id": "10032",
|
||||
"name": "Blocker",
|
||||
"inward": "is blocked by",
|
||||
"outward": "blocks",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10032"
|
||||
},
|
||||
"Child-Issue": {
|
||||
"id": "12310460",
|
||||
"name": "Child-Issue",
|
||||
"inward": "is a child of",
|
||||
"outward": "is a parent of",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310460"
|
||||
},
|
||||
"Cloners": {
|
||||
"id": "10020",
|
||||
"name": "Cloners",
|
||||
"inward": "is cloned by",
|
||||
"outward": "is a clone of",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10020"
|
||||
},
|
||||
"Completes": {
|
||||
"id": "12310660",
|
||||
"name": "Completes",
|
||||
"inward": "is fixed by",
|
||||
"outward": "fixes",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310660"
|
||||
},
|
||||
"Container": {
|
||||
"id": "12310060",
|
||||
"name": "Container",
|
||||
"inward": "Is contained by",
|
||||
"outward": "contains",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310060"
|
||||
},
|
||||
"Dependency": {
|
||||
"id": "12310461",
|
||||
"name": "Dependency",
|
||||
"inward": "Dependency",
|
||||
"outward": "Dependency",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310461"
|
||||
},
|
||||
"Dependent": {
|
||||
"id": "12310360",
|
||||
"name": "Dependent",
|
||||
"inward": "Dependent",
|
||||
"outward": "Dependent",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310360"
|
||||
},
|
||||
"Duplicate": {
|
||||
"id": "12310000",
|
||||
"name": "Duplicate",
|
||||
"inward": "is duplicated by",
|
||||
"outward": "duplicates",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310000"
|
||||
},
|
||||
"Incorporates": {
|
||||
"id": "12310010",
|
||||
"name": "Incorporates",
|
||||
"inward": "is part of",
|
||||
"outward": "incorporates",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310010"
|
||||
},
|
||||
"Issue split": {
|
||||
"id": "12310761",
|
||||
"name": "Issue split",
|
||||
"inward": "split from",
|
||||
"outward": "split to",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310761"
|
||||
},
|
||||
"Parent Feature": {
|
||||
"id": "12310462",
|
||||
"name": "Parent Feature",
|
||||
"inward": "Parent Feature",
|
||||
"outward": "Parent Feature",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310462"
|
||||
},
|
||||
"Problem/Incident": {
|
||||
"id": "12310560",
|
||||
"name": "Problem/Incident",
|
||||
"inward": "is caused by",
|
||||
"outward": "causes",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310560"
|
||||
},
|
||||
"Reference": {
|
||||
"id": "10030",
|
||||
"name": "Reference",
|
||||
"inward": "is related to",
|
||||
"outward": "relates to",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10030"
|
||||
},
|
||||
"Regression": {
|
||||
"id": "12310050",
|
||||
"name": "Regression",
|
||||
"inward": "is broken by",
|
||||
"outward": "breaks",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310050"
|
||||
},
|
||||
"Related": {
|
||||
"id": "12310260",
|
||||
"name": "Related",
|
||||
"inward": "is related to",
|
||||
"outward": "relates to",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310260"
|
||||
},
|
||||
"Required": {
|
||||
"id": "12310040",
|
||||
"name": "Required",
|
||||
"inward": "is required by",
|
||||
"outward": "requires",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310040"
|
||||
},
|
||||
"Supercedes": {
|
||||
"id": "12310051",
|
||||
"name": "Supercedes",
|
||||
"inward": "is superceded by",
|
||||
"outward": "supercedes",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310051"
|
||||
},
|
||||
"Testing": {
|
||||
"id": "12310760",
|
||||
"name": "Testing",
|
||||
"inward": "Discovered while testing",
|
||||
"outward": "Testing discovered",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310760"
|
||||
},
|
||||
"dependent": {
|
||||
"id": "10001",
|
||||
"name": "dependent",
|
||||
"inward": "is depended upon by",
|
||||
"outward": "depends upon",
|
||||
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10001"
|
||||
}
|
||||
},
|
||||
"Jira": {},
|
||||
"Mojang": {
|
||||
"Blocks": {
|
||||
"id": "10100",
|
||||
"name": "Blocks",
|
||||
"inward": "is blocked by",
|
||||
"outward": "blocks",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10100"
|
||||
},
|
||||
"Bonfire Testing": {
|
||||
"id": "10000",
|
||||
"name": "Bonfire Testing",
|
||||
"inward": "discovered while testing",
|
||||
"outward": "testing discovered",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10000"
|
||||
},
|
||||
"Cloners": {
|
||||
"id": "10101",
|
||||
"name": "Cloners",
|
||||
"inward": "is cloned by",
|
||||
"outward": "clones",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10101"
|
||||
},
|
||||
"Duplicate": {
|
||||
"id": "10102",
|
||||
"name": "Duplicate",
|
||||
"inward": "is duplicated by",
|
||||
"outward": "duplicates",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10102"
|
||||
},
|
||||
"Problem/Incident": {
|
||||
"id": "10500",
|
||||
"name": "Problem/Incident",
|
||||
"inward": "is caused by",
|
||||
"outward": "causes",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10500"
|
||||
},
|
||||
"Relates": {
|
||||
"id": "10103",
|
||||
"name": "Relates",
|
||||
"inward": "relates to",
|
||||
"outward": "relates to",
|
||||
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10103"
|
||||
}
|
||||
},
|
||||
"MongoDB": {},
|
||||
"Qt": {
|
||||
"Blocks": {
|
||||
"id": "10282",
|
||||
"name": "Blocks",
|
||||
"inward": "is blocked by",
|
||||
"outward": "blocks",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10282"
|
||||
},
|
||||
"Cloners": {
|
||||
"id": "10281",
|
||||
"name": "Cloners",
|
||||
"inward": "is cloned by",
|
||||
"outward": "clones",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10281"
|
||||
},
|
||||
"Covered": {
|
||||
"id": "10381",
|
||||
"name": "Covered",
|
||||
"inward": "is covered by",
|
||||
"outward": "covers",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10381"
|
||||
},
|
||||
"Dependency": {
|
||||
"id": "10001",
|
||||
"name": "Dependency",
|
||||
"inward": "is required for",
|
||||
"outward": "depends on",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10001"
|
||||
},
|
||||
"Duplicate": {
|
||||
"id": "10180",
|
||||
"name": "Duplicate",
|
||||
"inward": "is duplicated by",
|
||||
"outward": "duplicates",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10180"
|
||||
},
|
||||
"Issue split": {
|
||||
"id": "10280",
|
||||
"name": "Issue split",
|
||||
"inward": "split from",
|
||||
"outward": "split to",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10280"
|
||||
},
|
||||
"Relates": {
|
||||
"id": "10070",
|
||||
"name": "Relates",
|
||||
"inward": "relates to",
|
||||
"outward": "relates to",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10070"
|
||||
},
|
||||
"Replacement": {
|
||||
"id": "10031",
|
||||
"name": "Replacement",
|
||||
"inward": "replaces",
|
||||
"outward": "is replaced by",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10031"
|
||||
},
|
||||
"Test": {
|
||||
"id": "10020",
|
||||
"name": "Test",
|
||||
"inward": "Is tested by",
|
||||
"outward": "tests",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10020"
|
||||
},
|
||||
"Work Breakdown": {
|
||||
"id": "10040",
|
||||
"name": "Work Breakdown",
|
||||
"inward": "resulted from",
|
||||
"outward": "resulted in",
|
||||
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10040"
|
||||
}
|
||||
},
|
||||
"RedHat": {
|
||||
"Account": {
|
||||
"id": "12310920",
|
||||
"name": "Account",
|
||||
"inward": "account is impacted by",
|
||||
"outward": "impacts account",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310920"
|
||||
},
|
||||
"Blocks": {
|
||||
"id": "12310720",
|
||||
"name": "Blocks",
|
||||
"inward": "is blocked by",
|
||||
"outward": "blocks",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310720"
|
||||
},
|
||||
"Causality": {
|
||||
"id": "12310220",
|
||||
"name": "Causality",
|
||||
"inward": "is caused by",
|
||||
"outward": "causes",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310220"
|
||||
},
|
||||
"Cloners": {
|
||||
"id": "12310120",
|
||||
"name": "Cloners",
|
||||
"inward": "is cloned by",
|
||||
"outward": "clones",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310120"
|
||||
},
|
||||
"Depend": {
|
||||
"id": "12311220",
|
||||
"name": "Depend",
|
||||
"inward": "is depended on by",
|
||||
"outward": "depends on",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311220"
|
||||
},
|
||||
"Document": {
|
||||
"id": "12310420",
|
||||
"name": "Document",
|
||||
"inward": "is documented by",
|
||||
"outward": "documents",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310420"
|
||||
},
|
||||
"Duplicate": {
|
||||
"id": "12310000",
|
||||
"name": "Duplicate",
|
||||
"inward": "is duplicated by",
|
||||
"outward": "duplicates",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310000"
|
||||
},
|
||||
"Incorporates": {
|
||||
"id": "10011",
|
||||
"name": "Incorporates",
|
||||
"inward": "is incorporated by",
|
||||
"outward": "incorporates",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/10011"
|
||||
},
|
||||
"Issue split": {
|
||||
"id": "12311720",
|
||||
"name": "Issue split",
|
||||
"inward": "split from",
|
||||
"outward": "split to",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311720"
|
||||
},
|
||||
"Related": {
|
||||
"id": "12310001",
|
||||
"name": "Related",
|
||||
"inward": "is related to",
|
||||
"outward": "relates to",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310001"
|
||||
},
|
||||
"Triggers": {
|
||||
"id": "12310723",
|
||||
"name": "Triggers",
|
||||
"inward": "is triggered by",
|
||||
"outward": "is triggering",
|
||||
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310723"
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
;Year;Issues;DIT;UIT;Links;DLT;ULT;UP;Changes;Ch/I;Comments;Co/I
|
||||
RedHat;2001.0;502297.0;79.0;64.0;268935.0;11.0;11.0;807.0;7197717.0;14.0;1115471.0;2.0
|
||||
Qt;2005.0;180574.0;15.0;15.0;58621.0;10.0;10.0;60.0;2307707.0;13.0;507214.0;3.0
|
||||
Sum;;682871.0;94.0;79.0;327556.0;21.0;21.0;867.0;9505424.0;27.0;1622685.0;5.0
|
||||
Mean;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
|
||||
Median;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
|
||||
Std Dev;;160861.5;32.0;24.5;105157.0;0.5;0.5;373.5;2445005.0;0.5;304128.5;0.5
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,734 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "e35d2209-3e5b-4cd7-a702-2eed1badf800",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-01-25T09:42:06.183120Z",
|
||||
"iopub.status.busy": "2022-01-25T09:42:06.182949Z",
|
||||
"iopub.status.idle": "2022-01-25T09:42:06.839486Z",
|
||||
"shell.execute_reply": "2022-01-25T09:42:06.838906Z",
|
||||
"shell.execute_reply.started": "2022-01-25T09:42:06.183099Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"from time import time\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from pymongo import MongoClient\n",
|
||||
"from statistics import mean, median\n",
|
||||
"\n",
|
||||
"# 确保DataFrame的列长不会被截断\n",
|
||||
"pd.set_option(\"display.max_colwidth\", None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cb035275-5360-43cc-8dec-e7d1df4c7417",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-01-25T09:42:06.842313Z",
|
||||
"iopub.status.busy": "2022-01-25T09:42:06.841976Z",
|
||||
"iopub.status.idle": "2022-01-25T09:42:06.867490Z",
|
||||
"shell.execute_reply": "2022-01-25T09:42:06.866866Z",
|
||||
"shell.execute_reply.started": "2022-01-25T09:42:06.842270Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 加载Jira软件生态元数据\n",
|
||||
"with open(\"../data/jira_ecos_info.json\") as f:\n",
|
||||
" jira_ecos_info = json.load(f)\n",
|
||||
"\n",
|
||||
"# 加载生态中使用的Issue类型信息(使用`data_crawl.ipynb`下载)\n",
|
||||
"with open(\"../data/eco_issue_types.json\") as f:\n",
|
||||
" eco_issue_types = json.load(f)\n",
|
||||
"\n",
|
||||
"# 加载生态中使用的链接类型信息(使用`data_crawl.ipynb`下载)\n",
|
||||
"with open(\"../data/eco_link_types.json\") as f:\n",
|
||||
" eco_link_types = json.load(f)\n",
|
||||
"\n",
|
||||
"# 连接到数据库\n",
|
||||
"client = MongoClient()\n",
|
||||
"db = client[\"JiraEcos\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "07c7b0c3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ALL_ECOS = [name for name in jira_eco_sources.keys()]\n",
|
||||
"ALL_ECOS = [\"RedHat\", \"Qt\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "6897e14d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 格式化时间间隔\n",
|
||||
"def format_duration(start_time, end_time):\n",
|
||||
" # 计算总秒数\n",
|
||||
" seconds = end_time - start_time\n",
|
||||
" # 计算分钟和小时数\n",
|
||||
" minutes = int(seconds / 60)\n",
|
||||
" hours = int(minutes / 60)\n",
|
||||
" display_minutes = int(minutes % 60)\n",
|
||||
" display_seconds = int(seconds % 60)\n",
|
||||
"\n",
|
||||
" return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "d1f407e3-d97b-4125-9723-35b613b42534",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-01-25T09:42:06.923021Z",
|
||||
"iopub.status.busy": "2022-01-25T09:42:06.922589Z",
|
||||
"iopub.status.idle": "2022-01-25T09:42:06.928681Z",
|
||||
"shell.execute_reply": "2022-01-25T09:42:06.927331Z",
|
||||
"shell.execute_reply.started": "2022-01-25T09:42:06.922989Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ecos_df = pd.DataFrame(\n",
|
||||
" np.nan,\n",
|
||||
" columns=[\n",
|
||||
" \"Year\", # 生态创建时间\n",
|
||||
" \"Issues\", # Issue总数\n",
|
||||
" \"DIT\", # Documented Issue Types,登记的Issue类型数\n",
|
||||
" \"UIT\", # Used Issue Types,使用的Issue类型数\n",
|
||||
" \"Links\", # 链接总数\n",
|
||||
" \"DLT\", # Documented Link Types,登记的链接类型数\n",
|
||||
" \"ULT\", # Used Link Types,使用的链接类型数\n",
|
||||
" \"UP\", # Unique Projects,项目总数\n",
|
||||
" \"Changes\", # 更改总数\n",
|
||||
" \"Ch/I\", # Changes/Issues\n",
|
||||
" \"Comments\", # 评论总数\n",
|
||||
" \"Co/I\", # Comments/Issues\n",
|
||||
" ],\n",
|
||||
" index=ALL_ECOS + [\"Sum\", \"Mean\", \"Median\", \"Std Dev\"], # 总和、均值、中值、标准差\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "af17b5f7-adea-462a-a6bd-e0bf36290781",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-01-25T09:42:07.611366Z",
|
||||
"iopub.status.busy": "2022-01-25T09:42:07.611062Z",
|
||||
"iopub.status.idle": "2022-01-25T11:06:32.931020Z",
|
||||
"shell.execute_reply": "2022-01-25T11:06:32.923594Z",
|
||||
"shell.execute_reply.started": "2022-01-25T09:42:07.611342Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def populate_ecos_df(ecos_df, eco_names=ALL_ECOS):\n",
|
||||
" # 填充DataFrame\n",
|
||||
"\n",
|
||||
" def extract_number_of_issues(eco_name):\n",
|
||||
" # 查询Issue总数\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" num_issues = issues_collection.count_documents({})\n",
|
||||
"\n",
|
||||
" return num_issues\n",
|
||||
"\n",
|
||||
" def extract_number_of_doc_issuetypes(eco_name):\n",
|
||||
" # 查询记录的Issue类型数\n",
|
||||
" return len(eco_issue_types[eco_name])\n",
|
||||
"\n",
|
||||
" def extract_number_of_used_issuetypes(eco_name):\n",
|
||||
" # 查询在最后状态下的Issue类型\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" final_types_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 取出'$fields.issuetype.name'字段,并重命名\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"issuetype_name\": \"$fields.issuetype.name\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 分组,把所有Issue类型名放入一个集合\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" unique_final_issuetypes = (\n",
|
||||
" set(final_types_query[0][\"issuetype_names\"])\n",
|
||||
" if final_types_query != []\n",
|
||||
" else set()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 查询在历史中使用过的Issue类型\n",
|
||||
" histories_collection = db[eco_name + \"Histories\"]\n",
|
||||
" history_types_query = list(\n",
|
||||
" histories_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 展开'$history.items'数组\n",
|
||||
" {\"$unwind\": \"$history.items\"},\n",
|
||||
" # 筛选更改项item的域为'issuetype'的文档\n",
|
||||
" {\"$match\": {\"history.items.field\": \"issuetype\"}},\n",
|
||||
" # 取出item的'fromString',即更改前的Issue类型\n",
|
||||
" # !注意:更改后的Issue类型'toString'会在下一次更改中作为更改前的值\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"issuetype_name\": \"$history.items.fromString\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 分组,把所有Issue类型名放入一个集合\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" unique_history_issuetypes = (\n",
|
||||
" set(history_types_query[0][\"issuetype_names\"])\n",
|
||||
" if history_types_query != []\n",
|
||||
" else set()\n",
|
||||
" )\n",
|
||||
" # 合并两个集合\n",
|
||||
" return len(set.union(unique_final_issuetypes, unique_history_issuetypes))\n",
|
||||
"\n",
|
||||
" def extract_number_of_links(eco_name):\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" # 查询链接总数\n",
|
||||
" links_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 筛选'$fields.issuelinks'字段非空的文档\n",
|
||||
" {\"$match\": {\"fields.issuelinks\": {\"$exists\": True, \"$ne\": []}}},\n",
|
||||
" # 取出issuelink的id字段(数组)\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"issuelink_ids_issue\": \"$fields.issuelinks.id\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 把id字段数组展开\n",
|
||||
" {\"$unwind\": \"$issuelink_ids_issue\"},\n",
|
||||
" # 统计链接的id\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"issuelink_unique_ids\": {\n",
|
||||
" \"$addToSet\": \"$issuelink_ids_issue\"\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" num_issuelinks = (\n",
|
||||
" len(set(links_query[0][\"issuelink_unique_ids\"])) if links_query != [] else 0\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 查询subtask链接总数\n",
|
||||
" subtasks_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 筛选'$fields.subtasks'字段非空的文档\n",
|
||||
" {\"$match\": {\"fields.subtasks\": {\"$exists\": True, \"$ne\": []}}},\n",
|
||||
" # 计算Issue的subtask数量\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"num_issue_subtasks\": {\"$size\": \"$fields.subtasks\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 计算整个集合内subtask数量\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"num_subtasks\": {\"$sum\": \"$num_issue_subtasks\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" num_subtasks = subtasks_query[0][\"num_subtasks\"] if subtasks_query != [] else 0\n",
|
||||
"\n",
|
||||
" # 查询epic链接总数\n",
|
||||
" # epic链接字段是自定义的\n",
|
||||
" EPICLINK_FIELD_DICT = {\n",
|
||||
" \"Apache\": \"customfield_12311120\",\n",
|
||||
" \"Jira\": \"customfield_12931\",\n",
|
||||
" \"Mojang\": \"customfield_11602\",\n",
|
||||
" \"MongoDB\": \"customfield_10857\",\n",
|
||||
" \"Qt\": \"customfield_10400\",\n",
|
||||
" \"RedHat\": \"customfield_12311140\",\n",
|
||||
" }\n",
|
||||
" epiclinks_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 把自定义epic链接字段统一重命名为'epiclink_field'\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"epiclink_field\": f\"$fields.{EPICLINK_FIELD_DICT[eco_name]}\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 筛选epiclink字段非空的文档\n",
|
||||
" # !注意:epic链接是由子Issue指向父Issue的\n",
|
||||
" {\"$match\": {\"epiclink_field\": {\"$exists\": True, \"$ne\": None}}},\n",
|
||||
" # 统计聚合的文档总数\n",
|
||||
" {\"$count\": \"num_epiclinks\"},\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" num_epiclinks = (\n",
|
||||
" epiclinks_query[0][\"num_epiclinks\"] if epiclinks_query != [] else 0\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return sum([num_issuelinks, num_subtasks, num_epiclinks])\n",
|
||||
"\n",
|
||||
" def extract_number_of_doc_linktypes(eco_name):\n",
|
||||
" # 查询记录的链接类型数\n",
|
||||
" return len(eco_link_types[eco_name])\n",
|
||||
"\n",
|
||||
" def extract_number_of_used_linktypes(eco_name):\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" # 查询在最后状态下的链接类型\n",
|
||||
" final_linktypes_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 展开issuelinks数组\n",
|
||||
" {\"$unwind\": \"$fields.issuelinks\"},\n",
|
||||
" # 选择链接类型名字段\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"linktype_name\": \"$fields.issuelinks.type.name\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" # 分组,把所有链接类型名加入集合\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"linktype_names\": {\"$addToSet\": \"$linktype_name\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return (\n",
|
||||
" len(set(final_linktypes_query[0][\"linktype_names\"]))\n",
|
||||
" if final_linktypes_query != []\n",
|
||||
" else 0\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" def extract_born(eco_name):\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" # 取出最初的N个Issue创建时间,检查生态的最早创建时间\n",
|
||||
" created_dates = [\n",
|
||||
" issue[\"fields\"][\"created\"]\n",
|
||||
" for issue in issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 取出Issue创建时间\n",
|
||||
" {\"$project\": {\"_id\": 0, \"fields.created\": 1}},\n",
|
||||
" # 按创建时间升序排列\n",
|
||||
" {\"$sort\": {\"fields.created\": 1}},\n",
|
||||
" # 实际中,有些Issue会损坏或者是测试Issue,所以需要手动检查创建时间\n",
|
||||
" {\"$limit\": 500},\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" ]\n",
|
||||
" # 手动检查创建时间,把损坏的或测试Issue的创建时间略过\n",
|
||||
" if eco_name == \"Apache\":\n",
|
||||
" created_dates = created_dates[289:]\n",
|
||||
" elif eco_name == \"Jira\":\n",
|
||||
" created_dates = created_dates[1:]\n",
|
||||
" elif eco_name == \"Qt\":\n",
|
||||
" created_dates = created_dates[7:]\n",
|
||||
"\n",
|
||||
" return float(created_dates[0][:4])\n",
|
||||
"\n",
|
||||
" def extract_number_of_changes(eco_name):\n",
|
||||
" # 查询更改总数\n",
|
||||
" histories_collection = db[eco_name + \"Histories\"]\n",
|
||||
" changes_query = list(\n",
|
||||
" histories_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" # 取出更改对应的域(数组)\n",
|
||||
" {\"$project\": {\"_id\": 0, \"history.items.field\": 1}},\n",
|
||||
" # 把更改数组展开\n",
|
||||
" {\"$unwind\": \"$history.items\"},\n",
|
||||
" # 统计更改总数\n",
|
||||
" {\"$count\": \"num_changes\"},\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return changes_query[0][\"num_changes\"] if changes_query != [] else 0\n",
|
||||
"\n",
|
||||
" def extract_number_of_unique_projects(eco_name):\n",
|
||||
" # 查询在最后状态下的项目名\n",
|
||||
" issues_collection = db[eco_name]\n",
|
||||
" final_projects_query = list(\n",
|
||||
" issues_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" {\"$project\": {\"_id\": 0, \"project_name\": \"$fields.project.name\"}},\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" unique_final_projects = (\n",
|
||||
" set(final_projects_query[0][\"project_names\"])\n",
|
||||
" if final_projects_query != []\n",
|
||||
" else set()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 查询在历史中的项目名\n",
|
||||
" histories_collection = db[eco_name + \"Histories\"]\n",
|
||||
" history_projects_query = list(\n",
|
||||
" histories_collection.aggregate(\n",
|
||||
" [\n",
|
||||
" {\"$unwind\": \"$history.items\"},\n",
|
||||
" {\n",
|
||||
" \"$match\": {\n",
|
||||
" \"history.items.field\": {\"$in\": [\"project\", \"Project\"]}\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"project_name\": \"$history.items.fromString\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"$group\": {\n",
|
||||
" \"_id\": None,\n",
|
||||
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" unique_history_projects = (\n",
|
||||
" set(history_projects_query[0][\"project_names\"])\n",
|
||||
" if history_projects_query != []\n",
|
||||
" else set()\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return len(set.union(unique_final_projects, unique_history_projects))\n",
|
||||
"\n",
|
||||
" def extract_number_of_comments(eco_name):\n",
|
||||
" # 查询评论总数\n",
|
||||
" comments_collection = db[eco_name + \"Comments\"]\n",
|
||||
" num_comments = comments_collection.count_documents({})\n",
|
||||
" return num_comments\n",
|
||||
"\n",
|
||||
" start_time = time() # 记录总处理时间\n",
|
||||
" for eco_name in eco_names:\n",
|
||||
" eco_start_time = time() # 记录处理一个ecosystem的时间\n",
|
||||
" print(f\"Working on ecosystem: {eco_name} ...\")\n",
|
||||
"\n",
|
||||
" # Issue总数及类型\n",
|
||||
" ecos_df.loc[eco_name, \"Issues\"] = extract_number_of_issues(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"DIT\"] = extract_number_of_doc_issuetypes(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"UIT\"] = extract_number_of_used_issuetypes(eco_name)\n",
|
||||
"\n",
|
||||
" # Link总数及类型\n",
|
||||
" ecos_df.loc[eco_name, \"Links\"] = extract_number_of_links(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"DLT\"] = extract_number_of_doc_linktypes(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"ULT\"] = extract_number_of_used_linktypes(eco_name)\n",
|
||||
"\n",
|
||||
" # 其他信息\n",
|
||||
" ecos_df.loc[eco_name, \"Year\"] = extract_born(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"Changes\"] = extract_number_of_changes(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"Ch/I\"] = round(\n",
|
||||
" ecos_df.loc[eco_name, \"Changes\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
|
||||
" )\n",
|
||||
" ecos_df.loc[eco_name, \"UP\"] = extract_number_of_unique_projects(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"Comments\"] = extract_number_of_comments(eco_name)\n",
|
||||
" ecos_df.loc[eco_name, \"Co/I\"] = round(\n",
|
||||
" ecos_df.loc[eco_name, \"Comments\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
|
||||
" )\n",
|
||||
" print(\n",
|
||||
" f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
|
||||
" )\n",
|
||||
" print(\"\")\n",
|
||||
"\n",
|
||||
" print(\n",
|
||||
" f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
|
||||
" )\n",
|
||||
" return ecos_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "1698189d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Working on ecosystem: RedHat ...\n",
|
||||
"✔ RedHat completely processed. Duration: 00:00:45\n",
|
||||
"\n",
|
||||
"Working on ecosystem: Qt ...\n",
|
||||
"✔ Qt completely processed. Duration: 00:00:11\n",
|
||||
"\n",
|
||||
"✅ All completely processed. Total duration: 00:00:56\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ecos_df = populate_ecos_df(\n",
|
||||
" ecos_df,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "fc015a43-fd77-46bd-8712-b65476b36d3c",
|
||||
"metadata": {
|
||||
"execution": {
|
||||
"iopub.execute_input": "2022-01-25T11:06:33.006871Z",
|
||||
"iopub.status.busy": "2022-01-25T11:06:33.006437Z",
|
||||
"iopub.status.idle": "2022-01-25T11:06:33.162697Z",
|
||||
"shell.execute_reply": "2022-01-25T11:06:33.161886Z",
|
||||
"shell.execute_reply.started": "2022-01-25T11:06:33.006832Z"
|
||||
},
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def display_ecos_df(ecos_df):\n",
|
||||
"\n",
|
||||
" # 计算各列总和、均值、中值以及标准差\n",
|
||||
" for header in ecos_df.columns:\n",
|
||||
" if header in [\"Year\"]:\n",
|
||||
" continue\n",
|
||||
" ecos_df.loc[\"Sum\", header] = sum(ecos_df[header][: len(ALL_ECOS)])\n",
|
||||
" ecos_df.loc[\"Mean\", header] = mean(ecos_df[header][: len(ALL_ECOS)])\n",
|
||||
" ecos_df.loc[\"Median\", header] = median(ecos_df[header][: len(ALL_ECOS)])\n",
|
||||
" ecos_df.loc[\"Std Dev\", header] = np.std(ecos_df[header][: len(ALL_ECOS)])\n",
|
||||
"\n",
|
||||
" # 格式化某些列的值\n",
|
||||
" comma_separated_columns = {\n",
|
||||
" col_name: \"{:,.0f}\" for col_name in [\"Issues\", \"Links\", \"Changes\", \"Comments\"]\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # 展示DataFrame\n",
|
||||
" display(\n",
|
||||
" ecos_df.style.set_table_styles(\n",
|
||||
" [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n",
|
||||
" ).format(comma_separated_columns, precision=0)\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7dfad2f7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style type=\"text/css\">\n",
|
||||
"#T_9db15 th {\n",
|
||||
" text-align: left;\n",
|
||||
"}\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_9db15\">\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th class=\"blank level0\" > </th>\n",
|
||||
" <th id=\"T_9db15_level0_col0\" class=\"col_heading level0 col0\" >Year</th>\n",
|
||||
" <th id=\"T_9db15_level0_col1\" class=\"col_heading level0 col1\" >Issues</th>\n",
|
||||
" <th id=\"T_9db15_level0_col2\" class=\"col_heading level0 col2\" >DIT</th>\n",
|
||||
" <th id=\"T_9db15_level0_col3\" class=\"col_heading level0 col3\" >UIT</th>\n",
|
||||
" <th id=\"T_9db15_level0_col4\" class=\"col_heading level0 col4\" >Links</th>\n",
|
||||
" <th id=\"T_9db15_level0_col5\" class=\"col_heading level0 col5\" >DLT</th>\n",
|
||||
" <th id=\"T_9db15_level0_col6\" class=\"col_heading level0 col6\" >ULT</th>\n",
|
||||
" <th id=\"T_9db15_level0_col7\" class=\"col_heading level0 col7\" >UP</th>\n",
|
||||
" <th id=\"T_9db15_level0_col8\" class=\"col_heading level0 col8\" >Changes</th>\n",
|
||||
" <th id=\"T_9db15_level0_col9\" class=\"col_heading level0 col9\" >Ch/I</th>\n",
|
||||
" <th id=\"T_9db15_level0_col10\" class=\"col_heading level0 col10\" >Comments</th>\n",
|
||||
" <th id=\"T_9db15_level0_col11\" class=\"col_heading level0 col11\" >Co/I</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row0\" class=\"row_heading level0 row0\" >RedHat</th>\n",
|
||||
" <td id=\"T_9db15_row0_col0\" class=\"data row0 col0\" >2001</td>\n",
|
||||
" <td id=\"T_9db15_row0_col1\" class=\"data row0 col1\" >502,297</td>\n",
|
||||
" <td id=\"T_9db15_row0_col2\" class=\"data row0 col2\" >79</td>\n",
|
||||
" <td id=\"T_9db15_row0_col3\" class=\"data row0 col3\" >64</td>\n",
|
||||
" <td id=\"T_9db15_row0_col4\" class=\"data row0 col4\" >268,935</td>\n",
|
||||
" <td id=\"T_9db15_row0_col5\" class=\"data row0 col5\" >11</td>\n",
|
||||
" <td id=\"T_9db15_row0_col6\" class=\"data row0 col6\" >11</td>\n",
|
||||
" <td id=\"T_9db15_row0_col7\" class=\"data row0 col7\" >807</td>\n",
|
||||
" <td id=\"T_9db15_row0_col8\" class=\"data row0 col8\" >7,197,717</td>\n",
|
||||
" <td id=\"T_9db15_row0_col9\" class=\"data row0 col9\" >14</td>\n",
|
||||
" <td id=\"T_9db15_row0_col10\" class=\"data row0 col10\" >1,115,471</td>\n",
|
||||
" <td id=\"T_9db15_row0_col11\" class=\"data row0 col11\" >2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row1\" class=\"row_heading level0 row1\" >Qt</th>\n",
|
||||
" <td id=\"T_9db15_row1_col0\" class=\"data row1 col0\" >2005</td>\n",
|
||||
" <td id=\"T_9db15_row1_col1\" class=\"data row1 col1\" >180,574</td>\n",
|
||||
" <td id=\"T_9db15_row1_col2\" class=\"data row1 col2\" >15</td>\n",
|
||||
" <td id=\"T_9db15_row1_col3\" class=\"data row1 col3\" >15</td>\n",
|
||||
" <td id=\"T_9db15_row1_col4\" class=\"data row1 col4\" >58,621</td>\n",
|
||||
" <td id=\"T_9db15_row1_col5\" class=\"data row1 col5\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row1_col6\" class=\"data row1 col6\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row1_col7\" class=\"data row1 col7\" >60</td>\n",
|
||||
" <td id=\"T_9db15_row1_col8\" class=\"data row1 col8\" >2,307,707</td>\n",
|
||||
" <td id=\"T_9db15_row1_col9\" class=\"data row1 col9\" >13</td>\n",
|
||||
" <td id=\"T_9db15_row1_col10\" class=\"data row1 col10\" >507,214</td>\n",
|
||||
" <td id=\"T_9db15_row1_col11\" class=\"data row1 col11\" >3</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row2\" class=\"row_heading level0 row2\" >Sum</th>\n",
|
||||
" <td id=\"T_9db15_row2_col0\" class=\"data row2 col0\" >nan</td>\n",
|
||||
" <td id=\"T_9db15_row2_col1\" class=\"data row2 col1\" >682,871</td>\n",
|
||||
" <td id=\"T_9db15_row2_col2\" class=\"data row2 col2\" >94</td>\n",
|
||||
" <td id=\"T_9db15_row2_col3\" class=\"data row2 col3\" >79</td>\n",
|
||||
" <td id=\"T_9db15_row2_col4\" class=\"data row2 col4\" >327,556</td>\n",
|
||||
" <td id=\"T_9db15_row2_col5\" class=\"data row2 col5\" >21</td>\n",
|
||||
" <td id=\"T_9db15_row2_col6\" class=\"data row2 col6\" >21</td>\n",
|
||||
" <td id=\"T_9db15_row2_col7\" class=\"data row2 col7\" >867</td>\n",
|
||||
" <td id=\"T_9db15_row2_col8\" class=\"data row2 col8\" >9,505,424</td>\n",
|
||||
" <td id=\"T_9db15_row2_col9\" class=\"data row2 col9\" >27</td>\n",
|
||||
" <td id=\"T_9db15_row2_col10\" class=\"data row2 col10\" >1,622,685</td>\n",
|
||||
" <td id=\"T_9db15_row2_col11\" class=\"data row2 col11\" >5</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row3\" class=\"row_heading level0 row3\" >Mean</th>\n",
|
||||
" <td id=\"T_9db15_row3_col0\" class=\"data row3 col0\" >nan</td>\n",
|
||||
" <td id=\"T_9db15_row3_col1\" class=\"data row3 col1\" >341,436</td>\n",
|
||||
" <td id=\"T_9db15_row3_col2\" class=\"data row3 col2\" >47</td>\n",
|
||||
" <td id=\"T_9db15_row3_col3\" class=\"data row3 col3\" >40</td>\n",
|
||||
" <td id=\"T_9db15_row3_col4\" class=\"data row3 col4\" >163,778</td>\n",
|
||||
" <td id=\"T_9db15_row3_col5\" class=\"data row3 col5\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row3_col6\" class=\"data row3 col6\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row3_col7\" class=\"data row3 col7\" >434</td>\n",
|
||||
" <td id=\"T_9db15_row3_col8\" class=\"data row3 col8\" >4,752,712</td>\n",
|
||||
" <td id=\"T_9db15_row3_col9\" class=\"data row3 col9\" >14</td>\n",
|
||||
" <td id=\"T_9db15_row3_col10\" class=\"data row3 col10\" >811,342</td>\n",
|
||||
" <td id=\"T_9db15_row3_col11\" class=\"data row3 col11\" >2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row4\" class=\"row_heading level0 row4\" >Median</th>\n",
|
||||
" <td id=\"T_9db15_row4_col0\" class=\"data row4 col0\" >nan</td>\n",
|
||||
" <td id=\"T_9db15_row4_col1\" class=\"data row4 col1\" >341,436</td>\n",
|
||||
" <td id=\"T_9db15_row4_col2\" class=\"data row4 col2\" >47</td>\n",
|
||||
" <td id=\"T_9db15_row4_col3\" class=\"data row4 col3\" >40</td>\n",
|
||||
" <td id=\"T_9db15_row4_col4\" class=\"data row4 col4\" >163,778</td>\n",
|
||||
" <td id=\"T_9db15_row4_col5\" class=\"data row4 col5\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row4_col6\" class=\"data row4 col6\" >10</td>\n",
|
||||
" <td id=\"T_9db15_row4_col7\" class=\"data row4 col7\" >434</td>\n",
|
||||
" <td id=\"T_9db15_row4_col8\" class=\"data row4 col8\" >4,752,712</td>\n",
|
||||
" <td id=\"T_9db15_row4_col9\" class=\"data row4 col9\" >14</td>\n",
|
||||
" <td id=\"T_9db15_row4_col10\" class=\"data row4 col10\" >811,342</td>\n",
|
||||
" <td id=\"T_9db15_row4_col11\" class=\"data row4 col11\" >2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9db15_level0_row5\" class=\"row_heading level0 row5\" >Std Dev</th>\n",
|
||||
" <td id=\"T_9db15_row5_col0\" class=\"data row5 col0\" >nan</td>\n",
|
||||
" <td id=\"T_9db15_row5_col1\" class=\"data row5 col1\" >160,862</td>\n",
|
||||
" <td id=\"T_9db15_row5_col2\" class=\"data row5 col2\" >32</td>\n",
|
||||
" <td id=\"T_9db15_row5_col3\" class=\"data row5 col3\" >24</td>\n",
|
||||
" <td id=\"T_9db15_row5_col4\" class=\"data row5 col4\" >105,157</td>\n",
|
||||
" <td id=\"T_9db15_row5_col5\" class=\"data row5 col5\" >0</td>\n",
|
||||
" <td id=\"T_9db15_row5_col6\" class=\"data row5 col6\" >0</td>\n",
|
||||
" <td id=\"T_9db15_row5_col7\" class=\"data row5 col7\" >374</td>\n",
|
||||
" <td id=\"T_9db15_row5_col8\" class=\"data row5 col8\" >2,445,005</td>\n",
|
||||
" <td id=\"T_9db15_row5_col9\" class=\"data row5 col9\" >0</td>\n",
|
||||
" <td id=\"T_9db15_row5_col10\" class=\"data row5 col10\" >304,128</td>\n",
|
||||
" <td id=\"T_9db15_row5_col11\" class=\"data row5 col11\" >0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"<pandas.io.formats.style.Styler at 0x7f2c251df1f0>"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display_ecos_df(ecos_df)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "63163049",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ecos_df.to_csv(\"../data/ecos_overview.csv\", sep=\";\", encoding=\"utf-8\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
},
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,max,min,median,mean,std
|
||||
RedHat,without,421,0,1.0,2.1489339812279398,3.5470614635577187
|
||||
RedHat,with,117,0,1.0,2.2934478185438794,3.9874327658983035
|
||||
RedHat,in,117,0,1.0,2.268535068928034,4.001612675816817
|
||||
RedHat,cross,117,0,2.0,3.1102165516976346,4.951732355023365
|
|
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,max,min,median,mean,std
|
||||
RedHat,without,421,0,1.0,1.8907019826905265,3.290788730118782
|
||||
RedHat,with,117,0,2.0,2.993302339192955,4.6213686922437915
|
||||
RedHat,in,117,0,2.0,3.0957131121431085,4.758508197318559
|
||||
RedHat,cross,117,0,2.0,3.1515352998065764,4.9921727596311385
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,2 @@
|
|||
ecosystem,#issues,#issues_with_links,#links,%issues_with_links,#max_links,#min_links,#median_links,#mean_links,#link_types,#projects,#links_cross_project,%links_cross_project
|
||||
RedHat,502297,249581,238053,49.69,399,1,1.0,1.9076211730860924,13,279,34866,14.65
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
|
||||
RedHat,without,252716,80693,31.93,172023,68.07
|
||||
RedHat,with,249581,81104,32.5,168477,67.5
|
||||
RedHat,in,219867,69418,31.57,150449,68.43
|
||||
RedHat,cross,52043,20816,40.0,31227,60.0
|
|
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
|
||||
RedHat,without,351946,112787,32.05,239159,67.95
|
||||
RedHat,with,150351,49010,32.6,101341,67.4
|
||||
RedHat,in,119294,36870,30.91,82424,69.09
|
||||
RedHat,cross,49632,20120,40.54,29512,59.46
|
|
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,max,min,median,mean,std
|
||||
RedHat,without,6412,0,103,460,856
|
||||
RedHat,with,6283,0,113,366,660
|
||||
RedHat,in,6283,0,104,340,639
|
||||
RedHat,cross,5815,0,219,531,736
|
|
|
@ -0,0 +1,5 @@
|
|||
ecosystem,scope,max,min,median,mean,std
|
||||
RedHat,without,6412,0,95,405,785
|
||||
RedHat,with,6283,0,140,432,720
|
||||
RedHat,in,6283,0,124,399,701
|
||||
RedHat,cross,5815,0,230,548,746
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,7 @@
|
|||
ecosystem,scope,interval_type,max,min,median,mean,std
|
||||
RedHat,with,cti,5372,0,24,103,218
|
||||
RedHat,with,lti,5829,0,0,207,729
|
||||
RedHat,in,cti,5372,0,22,99,210
|
||||
RedHat,in,lti,5829,0,0,180,691
|
||||
RedHat,cross,cti,4815,0,33,132,268
|
||||
RedHat,cross,lti,5766,0,0,408,950
|
|
|
@ -0,0 +1,7 @@
|
|||
ecosystem,scope,interval_type,max,min,median,mean,std
|
||||
RedHat,with,cti,5372,0,28,120,256
|
||||
RedHat,with,lti,5829,0,0,399,979
|
||||
RedHat,in,cti,5372,0,28,118,251
|
||||
RedHat,in,lti,5829,0,0,388,978
|
||||
RedHat,cross,cti,4815,0,28,128,271
|
||||
RedHat,cross,lti,5766,0,0,443,984
|
|
Binary file not shown.
|
@ -0,0 +1,14 @@
|
|||
ecosystem,link_type,min,q1,median,q3,max,mean,std
|
||||
RedHat,Account,0,0.0,2.0,5.0,87,3.9,6.5
|
||||
RedHat,Blocks,0,1.0,2.0,4.0,117,3.5,5.5
|
||||
RedHat,Causality,0,0.0,2.0,4.0,100,3.6,5.7
|
||||
RedHat,Cloners,0,0.0,1.0,3.0,100,2.4,3.9
|
||||
RedHat,Depend,0,0.0,1.0,3.0,91,3.0,5.0
|
||||
RedHat,Document,0,0.0,2.0,5.0,63,3.6,5.3
|
||||
RedHat,Duplicate,0,1.0,2.0,4.0,117,3.4,5.2
|
||||
RedHat,Epic,0,0.0,1.0,2.0,80,1.7,3.3
|
||||
RedHat,Incorporates,0,1.0,1.0,3.0,83,2.6,4.0
|
||||
RedHat,Issue split,0,0.0,1.0,3.0,77,2.5,5.0
|
||||
RedHat,Related,0,1.0,2.0,5.0,117,3.9,5.6
|
||||
RedHat,Subtask,0,0.0,1.0,2.0,117,1.6,3.2
|
||||
RedHat,Triggers,0,0.0,2.0,4.0,62,3.4,5.6
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,14 @@
|
|||
ecosystem,link_type,number,percentage,num_in,per_in,num_cross,per_cross
|
||||
RedHat,Epic,67799,28.48,65360,96.4,2439,3.6
|
||||
RedHat,Subtask,45020,18.91,45013,99.98,7,0.02
|
||||
RedHat,Related,44222,18.58,34481,77.97,9741,22.03
|
||||
RedHat,Cloners,29629,12.45,19142,64.61,10487,35.39
|
||||
RedHat,Blocks,21106,8.87,16367,77.55,4739,22.45
|
||||
RedHat,Incorporates,12847,5.4,9154,71.25,3693,28.75
|
||||
RedHat,Duplicate,7080,2.97,6414,90.59,666,9.41
|
||||
RedHat,Causality,4122,1.73,2714,65.84,1408,34.16
|
||||
RedHat,Depend,2849,1.2,2311,81.12,538,18.88
|
||||
RedHat,Document,1652,0.69,811,49.09,841,50.91
|
||||
RedHat,Issue split,694,0.29,665,95.82,29,4.18
|
||||
RedHat,Account,568,0.24,462,81.34,106,18.66
|
||||
RedHat,Triggers,465,0.2,293,63.01,172,36.99
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,14 @@
|
|||
ecosystem,link_type,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
|
||||
RedHat,Account,886,246,27.77,640,72.23
|
||||
RedHat,Blocks,29857,9117,30.54,20740,69.46
|
||||
RedHat,Causality,6920,2563,37.04,4357,62.96
|
||||
RedHat,Cloners,48828,18147,37.17,30681,62.83
|
||||
RedHat,Depend,4247,1647,38.78,2600,61.22
|
||||
RedHat,Document,2795,672,24.04,2123,75.96
|
||||
RedHat,Duplicate,13104,2646,20.19,10458,79.81
|
||||
RedHat,Epic,77019,26516,34.43,50503,65.57
|
||||
RedHat,Incorporates,16505,8090,49.02,8415,50.98
|
||||
RedHat,Issue split,1071,467,43.6,604,56.4
|
||||
RedHat,Related,63125,19934,31.58,43191,68.42
|
||||
RedHat,Subtask,55986,15859,28.33,40127,71.67
|
||||
RedHat,Triggers,779,284,36.46,495,63.54
|
|
|
@ -0,0 +1,14 @@
|
|||
ecosystem,link_type,min,q1,median,q3,max,mean,std
|
||||
RedHat,Account,0.0,16.9,62.9,169.7,1189.9,129.8,169.4
|
||||
RedHat,Blocks,0.0,48.6,174.1,552.9,6283.1,468.0,722.8
|
||||
RedHat,Causality,0.0,40.2,167.9,608.2,3832.2,440.4,587.5
|
||||
RedHat,Cloners,0.0,29.1,126.6,381.8,4714.3,342.2,532.4
|
||||
RedHat,Depend,0.0,23.1,68.1,155.0,2052.2,115.4,145.9
|
||||
RedHat,Document,0.0,55.0,132.9,326.5,3792.5,253.9,337.0
|
||||
RedHat,Duplicate,0.0,16.2,104.0,444.4,5946.3,434.5,804.4
|
||||
RedHat,Epic,0.0,27.8,99.3,294.9,4253.5,233.3,329.4
|
||||
RedHat,Incorporates,0.0,80.1,213.4,575.2,5900.5,486.7,684.8
|
||||
RedHat,Issue split,0.0,21.0,52.9,156.3,1173.3,118.5,161.8
|
||||
RedHat,Related,0.0,45.1,173.7,594.8,6111.5,511.0,814.9
|
||||
RedHat,Subtask,0.0,20.0,77.0,295.0,5899.6,325.7,702.4
|
||||
RedHat,Triggers,0.0,18.3,63.0,197.0,1487.8,152.2,212.5
|
|
Binary file not shown.
|
@ -0,0 +1,27 @@
|
|||
ecosystem,link_type,interval_type,min,q1,median,q3,max,mean,std
|
||||
RedHat,Account,cti,0.0,7.5,28.3,85.4,2004.1,87.4,191.1
|
||||
RedHat,Account,lti,0.0,0.0,0.0,3.4,371.7,12.3,41.8
|
||||
RedHat,Blocks,cti,0.0,1.1,20.8,91.1,4020.7,93.5,199.4
|
||||
RedHat,Blocks,lti,0.0,2.1,1119.1,2767.7,5829.9,1541.8,1622.6
|
||||
RedHat,Causality,cti,0.0,7.3,43.9,149.4,3948.3,155.4,309.5
|
||||
RedHat,Causality,lti,0.0,0.0,359.4,1160.0,2693.0,600.7,652.4
|
||||
RedHat,Cloners,cti,0.0,0.0,13.4,71.2,2884.2,71.5,171.0
|
||||
RedHat,Cloners,lti,0.0,0.0,0.0,0.0,5546.1,137.0,661.2
|
||||
RedHat,Depend,cti,0.0,0.0,17.1,86.0,2884.0,79.8,180.4
|
||||
RedHat,Depend,lti,0.0,0.0,0.0,4.4,874.4,13.9,46.8
|
||||
RedHat,Document,cti,0.0,13.0,53.7,121.0,2700.0,119.3,221.6
|
||||
RedHat,Document,lti,0.0,0.0,0.0,5.8,997.2,17.3,62.6
|
||||
RedHat,Duplicate,cti,0.0,5.7,33.0,140.8,3196.8,138.3,275.9
|
||||
RedHat,Duplicate,lti,0.0,0.2,3.9,34.2,2638.1,50.5,144.9
|
||||
RedHat,Epic,cti,0.0,1.5,39.4,129.4,4159.7,103.3,169.9
|
||||
RedHat,Epic,lti,0.0,0.0,0.0,0.0,1444.8,6.4,35.3
|
||||
RedHat,Incorporates,cti,0.0,5.1,26.8,96.3,4179.5,106.5,238.7
|
||||
RedHat,Incorporates,lti,0.0,0.0,0.0,6.9,3161.8,19.8,95.8
|
||||
RedHat,Issue split,cti,0.0,14.0,28.0,92.8,2128.7,87.2,163.3
|
||||
RedHat,Issue split,lti,0.0,0.0,0.0,107.6,1163.8,102.2,200.3
|
||||
RedHat,Related,cti,0.0,7.4,46.8,165.6,5372.6,159.9,309.3
|
||||
RedHat,Related,lti,0.0,0.0,0.1,36.2,3677.8,181.5,478.4
|
||||
RedHat,Subtask,cti,0.0,0.0,1.6,46.2,3593.7,60.1,159.3
|
||||
RedHat,Subtask,lti,0.0,0.0,0.0,0.0,3677.8,1.3,27.4
|
||||
RedHat,Triggers,cti,0.0,11.3,32.2,124.0,1173.6,100.4,167.2
|
||||
RedHat,Triggers,lti,0.0,0.0,0.0,0.2,2599.0,17.6,134.1
|
|
|
@ -0,0 +1,360 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import csv\n",
|
||||
"from time import time\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from pathlib import Path\n",
|
||||
"from pymongo import MongoClient\n",
|
||||
"from pymongo.database import Database"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 软件生态名\n",
|
||||
"ECO_NAMES = [\n",
|
||||
" # \"Apache\",\n",
|
||||
" # \"Jira\",\n",
|
||||
" # \"Mojang\",\n",
|
||||
" # \"MongoDB\",\n",
|
||||
" # \"Qt\",\n",
|
||||
" \"RedHat\",\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 保存Issue和链接原始数据的目录\n",
|
||||
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
|
||||
"ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
"LINK_DIR = Path(\"../data/raw/links\")\n",
|
||||
"LINK_DIR.mkdir(parents=True, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def format_duration(start_time, end_time):\n",
|
||||
"    \"\"\"Format the time elapsed between two timestamps as HH:MM:SS.\n",
|
||||
"\n",
|
||||
"    Both arguments are numbers of seconds (for example values of time()).\n",
|
||||
"    Fractions of a second are dropped and hours are not capped at 24.\n",
|
||||
"    A negative span is reported as 00:00:00.\n",
|
||||
"    \"\"\"\n",
|
||||
"    # Clamp before splitting so truncation and modulo can never disagree on sign\n",
|
||||
"    total_seconds = max(0, int(end_time - start_time))\n",
|
||||
"    total_minutes, display_seconds = divmod(total_seconds, 60)\n",
|
||||
"    hours, display_minutes = divmod(total_minutes, 60)\n",
|
||||
"\n",
|
||||
"    return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _dig(document, *path):\n",
|
||||
"    \"\"\"Follow nested keys of a document; return None if a level is missing or null.\"\"\"\n",
|
||||
"    node = document\n",
|
||||
|
||||
"    for step in path:\n",
|
||||
"        try:\n",
|
||||
"            node = node[step]\n",
|
||||
"        except (KeyError, IndexError, TypeError):\n",
|
||||
"            return None\n",
|
||||
"    return node\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_issues_to_csv(eco_name: str, db: Database):\n",
|
||||
"    \"\"\"Write selected fields of every issue of one ecosystem to a CSV file.\n",
|
||||
"\n",
|
||||
"    Reads all documents of the collection db[eco_name] and writes them,\n",
|
||||
"    ';'-separated, to ISSUE_DIR/<eco_name>.csv. A document without key,\n",
|
||||
"    project key, project name or comment count is skipped and counted;\n",
|
||||
"    every other missing field is stored as None (an empty cell).\n",
|
||||
"    An empty collection yields a file that contains only the header row.\n",
|
||||
"    \"\"\"\n",
|
||||
"    # Fixed column order, so the header does not depend on the first row\n",
|
||||
"    fieldnames = [\n",
|
||||
"        \"key\",\n",
|
||||
"        \"project_key\",\n",
|
||||
"        \"project_name\",\n",
|
||||
"        \"issue_type\",\n",
|
||||
"        \"status\",\n",
|
||||
"        \"resolution\",\n",
|
||||
"        \"created_time\",\n",
|
||||
"        \"priority\",\n",
|
||||
"        \"title\",\n",
|
||||
"        \"description\",\n",
|
||||
"        \"num_comments\",\n",
|
||||
"    ]\n",
|
||||
"    mandatory = (\"key\", \"project_key\", \"project_name\", \"num_comments\")\n",
|
||||
"\n",
|
||||
"    issues = []\n",
|
||||
"    num_comments = 0  # comments summed over all extracted issues\n",
|
||||
"    num_skipped = 0  # documents dropped because a mandatory field is missing\n",
|
||||
"\n",
|
||||
"    # Visit every document of the ecosystem's collection\n",
|
||||
"    for issue in tqdm(db[eco_name].find({})):\n",
|
||||
"        fields = _dig(issue, \"fields\")\n",
|
||||
"        row = {\n",
|
||||
"            \"key\": _dig(issue, \"key\"),\n",
|
||||
"            \"project_key\": _dig(fields, \"project\", \"key\"),\n",
|
||||
"            \"project_name\": _dig(fields, \"project\", \"name\"),\n",
|
||||
"            \"issue_type\": _dig(fields, \"issuetype\", \"name\"),\n",
|
||||
"            \"status\": _dig(fields, \"status\", \"name\"),\n",
|
||||
"            \"resolution\": _dig(fields, \"resolution\", \"name\"),\n",
|
||||
"            \"created_time\": _dig(fields, \"created\"),\n",
|
||||
"            \"priority\": _dig(fields, \"priority\", \"name\"),\n",
|
||||
"            \"title\": _dig(fields, \"summary\"),\n",
|
||||
"            \"description\": _dig(fields, \"description\"),\n",
|
||||
"            \"num_comments\": _dig(fields, \"comment\", \"total\"),\n",
|
||||
"        }\n",
|
||||
"\n",
|
||||
"        if any(row[name] is None for name in mandatory):\n",
|
||||
"            num_skipped += 1\n",
|
||||
"            continue\n",
|
||||
"\n",
|
||||
"        issues.append(row)\n",
|
||||
"        num_comments += row[\"num_comments\"]\n",
|
||||
"\n",
|
||||
"    filename = ISSUE_DIR / (eco_name + \".csv\")\n",
|
||||
"    # newline=\"\" leaves line endings to the csv module, as its documentation requires\n",
|
||||
"    with open(filename, \"w\", newline=\"\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
|
||||
"        dict_writer = csv.DictWriter(output_file, fieldnames, delimiter=\";\")\n",
|
||||
"        dict_writer.writeheader()\n",
|
||||
"        dict_writer.writerows(issues)\n",
|
||||
"\n",
|
||||
"    print(\n",
|
||||
"        f\"✔ Extracted {len(issues)} raw issues with {num_comments} comments from {eco_name}\"\n",
|
||||
"    )\n",
|
||||
"    if num_skipped:\n",
|
||||
"        print(f\"  ({num_skipped} documents skipped: no key, project or comment count)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Epic链接自定义字段\n",
|
||||
"EPICLINK_FIELD_DICT = {\n",
|
||||
" \"Apache\": \"customfield_12311120\",\n",
|
||||
" \"Jira\": \"customfield_12931\",\n",
|
||||
" \"Mojang\": \"customfield_11602\",\n",
|
||||
" \"MongoDB\": \"customfield_10857\",\n",
|
||||
" \"Qt\": \"customfield_10400\",\n",
|
||||
" \"RedHat\": \"customfield_12311140\",\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _link_row(link_type, in_issue_key, out_issue_key):\n",
|
||||
"    \"\"\"Build the CSV row of one link between two issue keys.\"\"\"\n",
|
||||
"    return {\n",
|
||||
"        # Both end points joined by \"_\"; intended as the identifier of the link\n",
|
||||
"        \"link_key\": in_issue_key + \"_\" + out_issue_key,\n",
|
||||
"        \"link_type\": link_type,\n",
|
||||
"        \"in_issue_key\": in_issue_key,\n",
|
||||
"        \"out_issue_key\": out_issue_key,\n",
|
||||
"    }\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def extract_links_to_csv(eco_name: str, db: Database):\n",
|
||||
"    \"\"\"Write every link of one ecosystem to a CSV file.\n",
|
||||
"\n",
|
||||
"    Reads all documents of the collection db[eco_name] and writes, to\n",
|
||||
"    LINK_DIR/<eco_name>.csv (';'-separated), one row per\n",
|
||||
"    - entry of fields.issuelinks (type taken from the entry),\n",
|
||||
"    - entry of fields.subtasks (type \"Subtask\", parent -> child),\n",
|
||||
"    - value of the ecosystem's Epic custom field (type \"Epic\").\n",
|
||||
"    The three sources are read independently: a malformed entry is skipped\n",
|
||||
"    and counted without discarding the other links of the same issue.\n",
|
||||
"    Without any link the file contains only the header row.\n",
|
||||
"    \"\"\"\n",
|
||||
"    fieldnames = [\"link_key\", \"link_type\", \"in_issue_key\", \"out_issue_key\"]\n",
|
||||
"    # Name of the custom field that stores the Epic key; None if none is configured\n",
|
||||
"    epic_field = EPICLINK_FIELD_DICT.get(eco_name)\n",
|
||||
"\n",
|
||||
"    links = []\n",
|
||||
"    num_malformed = 0  # documents or link entries that could not be read\n",
|
||||
"\n",
|
||||
"    for issue in tqdm(db[eco_name].find({})):\n",
|
||||
"        key = issue.get(\"key\")\n",
|
||||
"        fields = issue.get(\"fields\")\n",
|
||||
"        if not isinstance(key, str) or not isinstance(fields, dict):\n",
|
||||
"            num_malformed += 1\n",
|
||||
"            continue\n",
|
||||
"\n",
|
||||
"        # Regular links\n",
|
||||
"        for issuelink in fields.get(\"issuelinks\") or []:\n",
|
||||
"            try:\n",
|
||||
"                link_type = issuelink[\"type\"][\"name\"]\n",
|
||||
"                try:\n",
|
||||
"                    # An outwardIssue entry means the link starts at this issue\n",
|
||||
"                    ends = (key, issuelink[\"outwardIssue\"][\"key\"])\n",
|
||||
"                except (KeyError, TypeError):\n",
|
||||
"                    ends = (issuelink[\"inwardIssue\"][\"key\"], key)\n",
|
||||
"                links.append(_link_row(link_type, *ends))\n",
|
||||
"            except (KeyError, TypeError):\n",
|
||||
"                num_malformed += 1\n",
|
||||
"\n",
|
||||
"        # Subtask links always point from the parent issue to the child issue\n",
|
||||
"        for subtask in fields.get(\"subtasks\") or []:\n",
|
||||
"            try:\n",
|
||||
"                links.append(_link_row(\"Subtask\", key, subtask[\"key\"]))\n",
|
||||
"            except (KeyError, TypeError):\n",
|
||||
"                num_malformed += 1\n",
|
||||
"\n",
|
||||
"        # Epic link: only present when the custom field holds an issue key\n",
|
||||
"        epic_key = fields.get(epic_field) if epic_field else None\n",
|
||||
"        if isinstance(epic_key, str):\n",
|
||||
"            links.append(_link_row(\"Epic\", key, epic_key))\n",
|
||||
"\n",
|
||||
"    filename = LINK_DIR / (eco_name + \".csv\")\n",
|
||||
"    # newline=\"\" leaves line endings to the csv module, as its documentation requires\n",
|
||||
"    with open(filename, \"w\", newline=\"\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
|
||||
"        dict_writer = csv.DictWriter(output_file, fieldnames, delimiter=\";\")\n",
|
||||
"        dict_writer.writeheader()\n",
|
||||
"        dict_writer.writerows(links)\n",
|
||||
"\n",
|
||||
"    print(f\"✔ Extracted {len(links)} raw links from {eco_name}\")\n",
|
||||
"    if num_malformed:\n",
|
||||
"        print(f\"  ({num_malformed} malformed documents or link entries skipped)\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting issues and links data from database...\n",
|
||||
"Working on ecosystem: RedHat ...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"502297it [00:43, 11442.78it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✔ Extracted 502297 raw issues with 1115471 comments from RedHat\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"502297it [00:38, 13038.55it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✔ Extracted 405070 raw links from RedHat\n",
|
||||
"✔ RedHat completely processed. Duration: 00:01:31\n",
|
||||
"\n",
|
||||
"✅ All completely processed. Total duration: 00:01:31\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Extract the raw issues and links of every configured ecosystem.\n",
|
||||
"# The client is created with default connection settings and closed on exit.\n",
|
||||
"with MongoClient() as client:\n",
|
||||
"    start_time = time()  # start of the whole run\n",
|
||||
"    db = client[\"JiraEcos\"]\n",
|
||||
"    print(\"Extracting issues and links data from database...\")\n",
|
||||
"\n",
|
||||
"    for eco_name in ECO_NAMES:\n",
|
||||
"        eco_start_time = time()  # start of this ecosystem's extraction\n",
|
||||
"        print(f\"Working on ecosystem: {eco_name} ...\")\n",
|
||||
"\n",
|
||||
"        extract_issues_to_csv(eco_name, db)\n",
|
||||
"        extract_links_to_csv(eco_name, db)\n",
|
||||
"\n",
|
||||
"        print(\n",
|
||||
"            f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
|
||||
"        )\n",
|
||||
"        print(\"\")\n",
|
||||
"\n",
|
||||
"    print(\n",
|
||||
"        f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
|
||||
"    )"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "grad_pro_env",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,589 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "1345f1c8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tqdm import tqdm\n",
|
||||
"from pathlib import Path\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from pymongo import MongoClient"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "46d49dd9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# 软件生态名\n",
|
||||
"ECO_NAMES = [\n",
|
||||
" # \"Apache\",\n",
|
||||
" # \"Jira\",\n",
|
||||
" # \"Mojang\",\n",
|
||||
" # \"MongoDB\",\n",
|
||||
" # \"Qt\",\n",
|
||||
" \"RedHat\",\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "03cf285e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
|
||||
"LINK_DIR = Path(\"../data/raw/links\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "61e8f96a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PRO_ISSUE_DIR = Path(\"../data/processed/issues\")\n",
|
||||
"PRO_ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
|
||||
"PRO_LINK_DIR = Path(\"../data/processed/links\")\n",
|
||||
"PRO_LINK_DIR.mkdir(parents=True, exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "40885839",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_issues(eco_name: str):\n",
|
||||
" # 加载Issue数据DataFrame\n",
|
||||
"\n",
|
||||
" filename = ISSUE_DIR / (eco_name + \".csv\")\n",
|
||||
" issue_df = pd.read_csv(\n",
|
||||
" filename, sep=\";\", encoding=\"utf-8\", low_memory=False, index_col=[\"key\"]\n",
|
||||
" )\n",
|
||||
" return issue_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "1981c072",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_links(eco_name: str):\n",
|
||||
" # 加载链接数据DataFrame\n",
|
||||
"\n",
|
||||
" filename = LINK_DIR / (eco_name + \".csv\")\n",
|
||||
" link_df = pd.read_csv(filename, sep=\";\", encoding=\"utf-8\", low_memory=False)\n",
|
||||
" return link_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "865088c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def clean_issues(issue_df: pd.DataFrame):\n",
|
||||
" # 对Issue数据进行清洗\n",
|
||||
"\n",
|
||||
" # 把时间数据转换为统一格式\n",
|
||||
" issue_df[\"created_time\"] = pd.to_datetime(\n",
|
||||
" issue_df[\"created_time\"], errors=\"coerce\"\n",
|
||||
" ).apply(lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan)\n",
|
||||
"\n",
|
||||
" return issue_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "4447a7d2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def clean_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
|
||||
" # 对链接数据进行清洗\n",
|
||||
"\n",
|
||||
" def column_transform(row):\n",
|
||||
" return str(sorted(set([row[\"in_issue_key\"], row[\"out_issue_key\"]])))\n",
|
||||
"\n",
|
||||
" # 一条(一般类型)链接会在两个Issue的字段中存在,需要清除其中一个副本\n",
|
||||
" link_df.drop_duplicates(inplace=True)\n",
|
||||
" print(f\"Left with {len(link_df)} links after removing link duplication\")\n",
|
||||
"\n",
|
||||
" # 清除Issue是私有的、无权访问的链接\n",
|
||||
" condition = (\n",
|
||||
" link_df[[\"in_issue_key\", \"out_issue_key\"]]\n",
|
||||
" .isin(issue_df.index.values)\n",
|
||||
" .all(axis=1)\n",
|
||||
" )\n",
|
||||
" link_df = link_df[condition]\n",
|
||||
" print(f\"Left with {len(link_df)} links after removing half-private issues\")\n",
|
||||
"\n",
|
||||
" # 一对Issue间只允许存在一条链接,需要删除含有多条链接的Issue对\n",
|
||||
" # 首先基于'link_key'字段删除重复的Issue对\n",
|
||||
" # !注意:相同的'link_key'的Issue对之间是可能存在多种类型的链接,这会混淆关联关系,所以全部清除\n",
|
||||
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
|
||||
"\n",
|
||||
" # 其次,以防'link_key'是反过来的,比如issue1_issue2和issue2_issue1\n",
|
||||
" # 所以添加'sorted_issue_keys'字段,由链接的两个Issue的key升序组成\n",
|
||||
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
|
||||
" # 找出链接的两端Issue的keys相同的行对应的'sorted_issue_keys'字段值\n",
|
||||
" doublelinks = (\n",
|
||||
" (link_df[\"sorted_issue_keys\"].value_counts() > 1)\n",
|
||||
" .rename_axis(\"doubles\")\n",
|
||||
" .reset_index(name=\"valid\")\n",
|
||||
" )\n",
|
||||
" valid_double_keys = set(doublelinks[doublelinks[\"valid\"] == True][\"doubles\"])\n",
|
||||
"\n",
|
||||
" # 把重复的'sorted_issue_keys'字段对应的链接类型取出来检查,若类型数大于1,则清除这些Issue对\n",
|
||||
" for i in tqdm(valid_double_keys):\n",
|
||||
" if len(set(link_df[link_df[\"sorted_issue_keys\"] == i][\"link_type\"])) > 1:\n",
|
||||
" condition = link_df[\"sorted_issue_keys\"] != i\n",
|
||||
" link_df = link_df[condition]\n",
|
||||
" print(\n",
|
||||
" f\"Left with {len(link_df)} links after removing issue-pairs with multiple types of links between them\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 最后,留下来的链接中仍然可能有重复链接类型的Issue对(通过Issue的key对调的方式实现的),清除其中一个\n",
|
||||
" link_df.drop_duplicates(subset=[\"sorted_issue_keys\"], inplace=True)\n",
|
||||
" print(\n",
|
||||
" f\"Left with {len(link_df)} links after removing issue-pairs with duplicate same type of links\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" link_df.reset_index(inplace=True, drop=True)\n",
|
||||
"\n",
|
||||
" return link_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "ab8df842",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def joined_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
|
||||
" # 联合Issue和链接数据\n",
|
||||
"\n",
|
||||
" joined_df = link_df.join(issue_df.add_suffix(\"_in\"), on=\"in_issue_key\").join(\n",
|
||||
" issue_df.add_suffix(\"_out\"), on=\"out_issue_key\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # !注意:补充Subtask类型链接创建时间\n",
|
||||
" joined_df.loc[\n",
|
||||
" (joined_df[\"link_type\"] == \"Subtask\") & (joined_df[\"link_created_time\"].isna()),\n",
|
||||
" \"link_created_time\",\n",
|
||||
" ] = joined_df[\"created_time_out\"]\n",
|
||||
"\n",
|
||||
" return joined_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "7a496e37",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def query_issue_closed_time(eco_name: str, issue_df: pd.DataFrame):\n",
|
||||
" # 查询Issue的history,获取Issue关闭时间\n",
|
||||
"\n",
|
||||
" # 定义一个函数,用于处理每个分组\n",
|
||||
" def handle_group(group):\n",
|
||||
" # 如果分组内的closed_time全为NaN,则保留该分组的第一行\n",
|
||||
" if group[\"closed_time\"].isna().all():\n",
|
||||
" return group.iloc[0:1]\n",
|
||||
" # 否则,返回closed_time最大值对应的行\n",
|
||||
" else:\n",
|
||||
" return group.loc[[group[\"closed_time\"].idxmax()]]\n",
|
||||
"\n",
|
||||
" # 把索引列转换为普通列,列名为key\n",
|
||||
" issue_df = issue_df.reset_index().rename(columns={\"index\": \"key\"})\n",
|
||||
" # 创建Issue关闭时间列\n",
|
||||
" issue_df[\"closed_time\"] = None\n",
|
||||
"\n",
|
||||
" with MongoClient() as client:\n",
|
||||
" # 链接数据库\n",
|
||||
" db = client[\"JiraEcos\"]\n",
|
||||
" histories_collection = db[eco_name + \"Histories\"]\n",
|
||||
"\n",
|
||||
" # 首先取出需要查询的Issue keys并移除重复项\n",
|
||||
" issue_keys = issue_df[\"key\"].unique().tolist()\n",
|
||||
"\n",
|
||||
" # 构造聚合查询管道\n",
|
||||
" pipeline = [\n",
|
||||
" {\n",
|
||||
" # 第一步:筛选key在issue_keys列表中的文档\n",
|
||||
" \"$match\": {\"key\": {\"$in\": issue_keys}}\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第二步:展开history.items数组\n",
|
||||
" # 进而返回每个具体更改事件\n",
|
||||
" \"$unwind\": \"$history.items\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第三步:再次筛选满足特定field值的展开后的文档\n",
|
||||
" # field字段为status保证更改事件是修改Issue状态\n",
|
||||
" # toString字段为Closed保证是关闭Issue\n",
|
||||
" \"$match\": {\n",
|
||||
" \"history.items.field\": \"status\",\n",
|
||||
" \"history.items.toString\": \"Closed\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第四步:指定返回文档的字段\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"query_key\": \"$key\",\n",
|
||||
" \"created\": \"$history.created\",\n",
|
||||
" \"field\": \"$history.items.field\",\n",
|
||||
" \"to\": \"$history.items.to\",\n",
|
||||
" \"toString\": \"$history.items.toString\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # 查询数据库\n",
|
||||
" query = list(histories_collection.aggregate(pipeline))\n",
|
||||
" # 转换为DataFrame\n",
|
||||
" query_df = pd.DataFrame(query)\n",
|
||||
" # print(query_df.head())\n",
|
||||
"\n",
|
||||
" print(\n",
|
||||
" f\"❕ Test print: {len(issue_df)} issues before merged with query DataFrame\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 合并DataFrame,基于key与query_key匹配\n",
|
||||
" merged_df = pd.merge(\n",
|
||||
" issue_df,\n",
|
||||
" query_df,\n",
|
||||
" left_on=\"key\",\n",
|
||||
" right_on=\"query_key\",\n",
|
||||
" how=\"left\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(\n",
|
||||
" f\"❕ Test print: {len(merged_df)} issues after merged with query DataFrame\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 将merged_df中的created值赋给合并后DataFrame的closed_time字段\n",
|
||||
" merged_df[\"closed_time\"] = pd.to_datetime(merged_df[\"created\"], errors=\"coerce\")\n",
|
||||
"\n",
|
||||
" # 裁切出需要的字段\n",
|
||||
" issue_df = merged_df[list(issue_df.columns)]\n",
|
||||
"\n",
|
||||
" # 最后,由于Issue可能会被多次开启与关闭,所以,保留最后一次关闭时间\n",
|
||||
" # 按照key进行分组,使用groupby和apply处理每个分组\n",
|
||||
" result_df = (\n",
|
||||
" issue_df.groupby(\"key\", as_index=False)\n",
|
||||
" .apply(handle_group)\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
" )\n",
|
||||
" # 统一时间格式\n",
|
||||
" result_df[\"closed_time\"] = result_df[\"closed_time\"].apply(\n",
|
||||
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
|
||||
" )\n",
|
||||
" # 把key列重新设置为索引列\n",
|
||||
" result_df = result_df.set_index(\"key\")\n",
|
||||
"\n",
|
||||
" print(f\"❕ Test print: {len(result_df)} issues after processed done\")\n",
|
||||
"\n",
|
||||
" return result_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "3401742e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def query_link_created_time(eco_name: str, link_df: pd.DataFrame):\n",
|
||||
" # 查询Issue的history,获取链接创建时间\n",
|
||||
"\n",
|
||||
" # 定义一个函数,用于处理每个分组\n",
|
||||
" def handle_group(group):\n",
|
||||
" # 如果分组内的link_created_time全为NaN,则保留该分组的第一行\n",
|
||||
" if group[\"link_created_time\"].isna().all():\n",
|
||||
" return group.iloc[0:1]\n",
|
||||
" # 否则,返回link_created_time最大值对应的行\n",
|
||||
" else:\n",
|
||||
" return group.loc[[group[\"link_created_time\"].idxmax()]]\n",
|
||||
"\n",
|
||||
" # 裁切出需要的字段\n",
|
||||
" link_df = link_df[[\"link_type\", \"in_issue_key\", \"out_issue_key\"]]\n",
|
||||
" # 创建链接创建时间列\n",
|
||||
" link_df[\"link_created_time\"] = None\n",
|
||||
"\n",
|
||||
" with MongoClient() as client:\n",
|
||||
" # 链接数据库\n",
|
||||
" db = client[\"JiraEcos\"]\n",
|
||||
" histories_collection = db[eco_name + \"Histories\"]\n",
|
||||
"\n",
|
||||
" # 首先取出需要查询的Issue keys并移除重复项\n",
|
||||
" out_issue_keys = link_df[\"out_issue_key\"].tolist()\n",
|
||||
" out_issue_keys = list(set(out_issue_keys))\n",
|
||||
"\n",
|
||||
" # 构造聚合查询管道\n",
|
||||
" pipeline = [\n",
|
||||
" {\n",
|
||||
" # 第一步:筛选key在out_issue_keys列表中的文档\n",
|
||||
" # 从而保证只取出有链接的Issue的更改事件\n",
|
||||
" \"$match\": {\"key\": {\"$in\": out_issue_keys}}\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第二步:展开history.items数组\n",
|
||||
" # 进而返回每个具体更改事件\n",
|
||||
" \"$unwind\": \"$history.items\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第三步:再次筛选满足特定field值的展开后的文档\n",
|
||||
" # field保证更改事件是链接创建或删除\n",
|
||||
" # to或toString字段不为空保证是创建链接的事件而不是删除\n",
|
||||
" \"$match\": {\n",
|
||||
" \"history.items.field\": {\n",
|
||||
" \"$in\": [\"Link\", \"Epic Child\", \"Parent\", \"Parent Issue\"]\n",
|
||||
" },\n",
|
||||
" \"$or\": [\n",
|
||||
" {\"history.items.to\": {\"$ne\": None}},\n",
|
||||
" {\"history.items.toString\": {\"$ne\": None}},\n",
|
||||
" ],\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" # 第四步:指定返回文档的格式\n",
|
||||
" # target_key根据链接类型获取to或toString字段信息\n",
|
||||
" \"$project\": {\n",
|
||||
" \"_id\": 0,\n",
|
||||
" \"key\": 1,\n",
|
||||
" \"created\": \"$history.created\",\n",
|
||||
" \"field\": \"$history.items.field\",\n",
|
||||
" \"to\": \"$history.items.to\",\n",
|
||||
" \"toString\": \"$history.items.toString\",\n",
|
||||
" \"target_key\": {\n",
|
||||
" \"$cond\": {\n",
|
||||
" \"if\": {\"$eq\": [\"$history.items.field\", \"Link\"]},\n",
|
||||
" \"then\": \"$history.items.to\",\n",
|
||||
" \"else\": \"$history.items.toString\",\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # 查询数据库\n",
|
||||
" query = list(histories_collection.aggregate(pipeline))\n",
|
||||
" # 转换为DataFrame\n",
|
||||
" query_df = pd.DataFrame(query)\n",
|
||||
" # print(query_df.head())\n",
|
||||
"\n",
|
||||
" # 合并DataFrame,基于out_issue_key和key匹配,in_issue_key和target_key匹配\n",
|
||||
" merged_df = pd.merge(\n",
|
||||
" link_df,\n",
|
||||
" query_df,\n",
|
||||
" left_on=[\"out_issue_key\", \"in_issue_key\"],\n",
|
||||
" right_on=[\"key\", \"target_key\"],\n",
|
||||
" how=\"left\",\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 将merged_df中的created值赋给合并后DataFrame的link_created_time字段\n",
|
||||
" merged_df[\"link_created_time\"] = merged_df[\"created\"]\n",
|
||||
"\n",
|
||||
" # 裁切出需要的字段\n",
|
||||
" link_df = merged_df[\n",
|
||||
" [\"link_type\", \"in_issue_key\", \"out_issue_key\", \"link_created_time\"]\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" # 最后,由于in_issue和out_issue之间可能会发生相同类型链接的多次创建活动\n",
|
||||
" # 所以,保留最后一次链接创建时间\n",
|
||||
" # 转换link_created_time为datetime以确保比较的准确性\n",
|
||||
" link_df[\"link_created_time\"] = pd.to_datetime(\n",
|
||||
" link_df[\"link_created_time\"], errors=\"coerce\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 按照除link_created_time以外的所有字段进行分组,使用groupby和apply处理每个分组\n",
|
||||
" result_df = (\n",
|
||||
" link_df.groupby(\n",
|
||||
" [\"link_type\", \"in_issue_key\", \"out_issue_key\"], as_index=False\n",
|
||||
" )\n",
|
||||
" .apply(handle_group)\n",
|
||||
" .reset_index(drop=True)\n",
|
||||
" )\n",
|
||||
" # 统一时间格式\n",
|
||||
" result_df[\"link_created_time\"] = result_df[\"link_created_time\"].apply(\n",
|
||||
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" return result_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "b42827a1",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✔ Loaded 502297 raw issues and 405070 raw links for RedHat\n",
|
||||
"Left with 268935 links after removing link duplication\n",
|
||||
"Left with 249733 links after removing half-private issues\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_523178/4239802332.py:23: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
|
||||
"/tmp/ipykernel_523178/4239802332.py:27: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
|
||||
"100%|██████████| 4004/4004 [01:48<00:00, 36.81it/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Left with 238349 links after removing issue-pairs with multiple types of links between them\n",
|
||||
"Left with 238053 links after removing issue-pairs with duplicate same type of links\n",
|
||||
"✔ Cleaned 502297 issues for RedHat\n",
|
||||
"✔ Cleaned 238053 links for RedHat\n",
|
||||
"✔ Link type distribution:\n",
|
||||
"link_type\n",
|
||||
"Epic 67799\n",
|
||||
"Subtask 45020\n",
|
||||
"Related 44222\n",
|
||||
"Cloners 29629\n",
|
||||
"Blocks 21106\n",
|
||||
"Incorporates 12847\n",
|
||||
"Duplicate 7080\n",
|
||||
"Causality 4122\n",
|
||||
"Depend 2849\n",
|
||||
"Document 1652\n",
|
||||
"Issue split 694\n",
|
||||
"Account 568\n",
|
||||
"Triggers 465\n",
|
||||
"Name: count, dtype: int64\n",
|
||||
"❕ Test print: 502297 issues before merged with query DataFrame\n",
|
||||
"❕ Test print: 561112 issues after merged with query DataFrame\n",
|
||||
"❕ Test print: 502297 issues after processed done\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_523178/834440382.py:16: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" link_df[\"link_created_time\"] = None\n",
|
||||
"/tmp/ipykernel_523178/834440382.py:100: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||
" link_df[\"link_created_time\"] = pd.to_datetime(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✅ ----------------------------\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for eco_name in ECO_NAMES:\n",
|
||||
" # 加载Issue和链接数据DataFrame\n",
|
||||
" issue_df = load_issues(eco_name)\n",
|
||||
" link_df = load_links(eco_name)\n",
|
||||
" print(\n",
|
||||
" f\"✔ Loaded {len(issue_df)} raw issues and {len(link_df)} raw links for {eco_name}\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # 对Issue和链接数据进行清理\n",
|
||||
" issue_df = clean_issues(issue_df)\n",
|
||||
" link_df = clean_links(link_df, issue_df)\n",
|
||||
" print(f\"✔ Cleaned {len(issue_df)} issues for {eco_name}\")\n",
|
||||
" print(f\"✔ Cleaned {len(link_df)} links for {eco_name}\")\n",
|
||||
"\n",
|
||||
" # 打印不同链接类型分布\n",
|
||||
" print(\"✔ Link type distribution:\")\n",
|
||||
" print(link_df[\"link_type\"].value_counts())\n",
|
||||
"\n",
|
||||
" # 添加Issue关闭时间\n",
|
||||
" issue_df = query_issue_closed_time(eco_name, issue_df)\n",
|
||||
"\n",
|
||||
" # 添加链接创建时间\n",
|
||||
" link_df = query_link_created_time(eco_name, link_df)\n",
|
||||
"\n",
|
||||
" # 联合Issue和链接数据\n",
|
||||
" link_df = joined_links(link_df, issue_df)\n",
|
||||
"\n",
|
||||
" # 保存清理后的Issue和链接数据\n",
|
||||
" issue_df.to_csv(\n",
|
||||
" PRO_ISSUE_DIR / (eco_name + \".csv\"),\n",
|
||||
" sep=\";\",\n",
|
||||
" index=True, #! issue_df的key被设置为了索引列,所以这里需要保存\n",
|
||||
" )\n",
|
||||
" link_df.to_csv(\n",
|
||||
" PRO_LINK_DIR / (eco_name + \".csv\"),\n",
|
||||
" sep=\";\",\n",
|
||||
" index=False,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" print(\"✅ ----------------------------\\n\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.18"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue