Complete the experiments for the empirical-analysis section

This commit is contained in:
huaian_zhou 2024-03-23 16:41:55 +08:00
parent cbf41e55be
commit 6d8ab9c802
44 changed files with 28861 additions and 0 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,341 @@
{
"Apache": {
"Blocked": {
"id": "12310361",
"name": "Blocked",
"inward": "Blocked",
"outward": "Blocked",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310361"
},
"Blocker": {
"id": "10032",
"name": "Blocker",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10032"
},
"Child-Issue": {
"id": "12310460",
"name": "Child-Issue",
"inward": "is a child of",
"outward": "is a parent of",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310460"
},
"Cloners": {
"id": "10020",
"name": "Cloners",
"inward": "is cloned by",
"outward": "is a clone of",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10020"
},
"Completes": {
"id": "12310660",
"name": "Completes",
"inward": "is fixed by",
"outward": "fixes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310660"
},
"Container": {
"id": "12310060",
"name": "Container",
"inward": "Is contained by",
"outward": "contains",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310060"
},
"Dependency": {
"id": "12310461",
"name": "Dependency",
"inward": "Dependency",
"outward": "Dependency",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310461"
},
"Dependent": {
"id": "12310360",
"name": "Dependent",
"inward": "Dependent",
"outward": "Dependent",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310360"
},
"Duplicate": {
"id": "12310000",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310000"
},
"Incorporates": {
"id": "12310010",
"name": "Incorporates",
"inward": "is part of",
"outward": "incorporates",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310010"
},
"Issue split": {
"id": "12310761",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310761"
},
"Parent Feature": {
"id": "12310462",
"name": "Parent Feature",
"inward": "Parent Feature",
"outward": "Parent Feature",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310462"
},
"Problem/Incident": {
"id": "12310560",
"name": "Problem/Incident",
"inward": "is caused by",
"outward": "causes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310560"
},
"Reference": {
"id": "10030",
"name": "Reference",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10030"
},
"Regression": {
"id": "12310050",
"name": "Regression",
"inward": "is broken by",
"outward": "breaks",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310050"
},
"Related": {
"id": "12310260",
"name": "Related",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310260"
},
"Required": {
"id": "12310040",
"name": "Required",
"inward": "is required by",
"outward": "requires",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310040"
},
"Supercedes": {
"id": "12310051",
"name": "Supercedes",
"inward": "is superceded by",
"outward": "supercedes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310051"
},
"Testing": {
"id": "12310760",
"name": "Testing",
"inward": "Discovered while testing",
"outward": "Testing discovered",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310760"
},
"dependent": {
"id": "10001",
"name": "dependent",
"inward": "is depended upon by",
"outward": "depends upon",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10001"
}
},
"Jira": {},
"Mojang": {
"Blocks": {
"id": "10100",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10100"
},
"Bonfire Testing": {
"id": "10000",
"name": "Bonfire Testing",
"inward": "discovered while testing",
"outward": "testing discovered",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10000"
},
"Cloners": {
"id": "10101",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10101"
},
"Duplicate": {
"id": "10102",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10102"
},
"Problem/Incident": {
"id": "10500",
"name": "Problem/Incident",
"inward": "is caused by",
"outward": "causes",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10500"
},
"Relates": {
"id": "10103",
"name": "Relates",
"inward": "relates to",
"outward": "relates to",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10103"
}
},
"MongoDB": {},
"Qt": {
"Blocks": {
"id": "10282",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10282"
},
"Cloners": {
"id": "10281",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10281"
},
"Covered": {
"id": "10381",
"name": "Covered",
"inward": "is covered by",
"outward": "covers",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10381"
},
"Dependency": {
"id": "10001",
"name": "Dependency",
"inward": "is required for",
"outward": "depends on",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10001"
},
"Duplicate": {
"id": "10180",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10180"
},
"Issue split": {
"id": "10280",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10280"
},
"Relates": {
"id": "10070",
"name": "Relates",
"inward": "relates to",
"outward": "relates to",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10070"
},
"Replacement": {
"id": "10031",
"name": "Replacement",
"inward": "replaces",
"outward": "is replaced by",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10031"
},
"Test": {
"id": "10020",
"name": "Test",
"inward": "Is tested by",
"outward": "tests",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10020"
},
"Work Breakdown": {
"id": "10040",
"name": "Work Breakdown",
"inward": "resulted from",
"outward": "resulted in",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10040"
}
},
"RedHat": {
"Account": {
"id": "12310920",
"name": "Account",
"inward": "account is impacted by",
"outward": "impacts account",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310920"
},
"Blocks": {
"id": "12310720",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310720"
},
"Causality": {
"id": "12310220",
"name": "Causality",
"inward": "is caused by",
"outward": "causes",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310220"
},
"Cloners": {
"id": "12310120",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310120"
},
"Depend": {
"id": "12311220",
"name": "Depend",
"inward": "is depended on by",
"outward": "depends on",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311220"
},
"Document": {
"id": "12310420",
"name": "Document",
"inward": "is documented by",
"outward": "documents",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310420"
},
"Duplicate": {
"id": "12310000",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310000"
},
"Incorporates": {
"id": "10011",
"name": "Incorporates",
"inward": "is incorporated by",
"outward": "incorporates",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/10011"
},
"Issue split": {
"id": "12311720",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311720"
},
"Related": {
"id": "12310001",
"name": "Related",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310001"
},
"Triggers": {
"id": "12310723",
"name": "Triggers",
"inward": "is triggered by",
"outward": "is triggering",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310723"
}
}
}
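The JSON above maps each ecosystem to its documented issue link types; every entry records the type id, name, and the inward/outward phrasing of the relation. A minimal sketch of reading this mapping (it assumes the file is saved as ../data/eco_link_types.json, the path the notebooks below load it from):

import json

# Load the per-ecosystem link type mapping.
with open("../data/eco_link_types.json") as f:
    eco_link_types = json.load(f)

# List the documented link type names per ecosystem.
for eco_name, link_types in eco_link_types.items():
    print(eco_name, sorted(link_types))

# Look up the directional phrasing of one link type, e.g. RedHat's "Blocks".
blocks = eco_link_types["RedHat"]["Blocks"]
print(blocks["outward"], "/", blocks["inward"])  # blocks / is blocked by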

View File

@ -0,0 +1,7 @@
;Year;Issues;DIT;UIT;Links;DLT;ULT;UP;Changes;Ch/I;Comments;Co/I
RedHat;2001.0;502297.0;79.0;64.0;268935.0;11.0;11.0;807.0;7197717.0;14.0;1115471.0;2.0
Qt;2005.0;180574.0;15.0;15.0;58621.0;10.0;10.0;60.0;2307707.0;13.0;507214.0;3.0
Sum;;682871.0;94.0;79.0;327556.0;21.0;21.0;867.0;9505424.0;27.0;1622685.0;5.0
Mean;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
Median;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
Std Dev;;160861.5;32.0;24.5;105157.0;0.5;0.5;373.5;2445005.0;0.5;304128.5;0.5
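This overview is written by the notebook below as a semicolon-separated CSV. A small sketch of reading it back (assuming the path ../data/ecos_overview.csv used in that notebook):

import pandas as pd

# The file is ";"-delimited; the first, unnamed column holds the ecosystem names.
ecos_overview = pd.read_csv("../data/ecos_overview.csv", sep=";", index_col=0)
print(ecos_overview.loc["RedHat", "Issues"])  # 502297.0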

File diff suppressed because it is too large

View File

@ -0,0 +1,734 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e35d2209-3e5b-4cd7-a702-2eed1badf800",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.183120Z",
"iopub.status.busy": "2022-01-25T09:42:06.182949Z",
"iopub.status.idle": "2022-01-25T09:42:06.839486Z",
"shell.execute_reply": "2022-01-25T09:42:06.838906Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.183099Z"
},
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from time import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pymongo import MongoClient\n",
"from statistics import mean, median\n",
"\n",
"# 确保DataFrame的列长不会被截断\n",
"pd.set_option(\"display.max_colwidth\", None)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cb035275-5360-43cc-8dec-e7d1df4c7417",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.842313Z",
"iopub.status.busy": "2022-01-25T09:42:06.841976Z",
"iopub.status.idle": "2022-01-25T09:42:06.867490Z",
"shell.execute_reply": "2022-01-25T09:42:06.866866Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.842270Z"
},
"tags": []
},
"outputs": [],
"source": [
"# 加载Jira软件生态元数据\n",
"with open(\"../data/jira_ecos_info.json\") as f:\n",
" jira_ecos_info = json.load(f)\n",
"\n",
"# 加载生态中使用的Issue类型信息使用`data_crawl.ipynb`下载)\n",
"with open(\"../data/eco_issue_types.json\") as f:\n",
" eco_issue_types = json.load(f)\n",
"\n",
"# 加载生态中使用的链接类型信息(使用`data_crawl.ipynb`下载)\n",
"with open(\"../data/eco_link_types.json\") as f:\n",
" eco_link_types = json.load(f)\n",
"\n",
"# 连接到数据库\n",
"client = MongoClient()\n",
"db = client[\"JiraEcos\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "07c7b0c3",
"metadata": {},
"outputs": [],
"source": [
"# ALL_ECOS = [name for name in jira_eco_sources.keys()]\n",
"ALL_ECOS = [\"RedHat\", \"Qt\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6897e14d",
"metadata": {},
"outputs": [],
"source": [
"# 格式化时间间隔\n",
"def format_duration(start_time, end_time):\n",
" # 计算总秒数\n",
" seconds = end_time - start_time\n",
" # 计算分钟和小时数\n",
" minutes = int(seconds / 60)\n",
" hours = int(minutes / 60)\n",
" display_minutes = int(minutes % 60)\n",
" display_seconds = int(seconds % 60)\n",
"\n",
" return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d1f407e3-d97b-4125-9723-35b613b42534",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.923021Z",
"iopub.status.busy": "2022-01-25T09:42:06.922589Z",
"iopub.status.idle": "2022-01-25T09:42:06.928681Z",
"shell.execute_reply": "2022-01-25T09:42:06.927331Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.922989Z"
},
"tags": []
},
"outputs": [],
"source": [
"ecos_df = pd.DataFrame(\n",
" np.nan,\n",
" columns=[\n",
" \"Year\", # 生态创建时间\n",
" \"Issues\", # Issue总数\n",
" \"DIT\", # Documented Issue Types登记的Issue类型数\n",
" \"UIT\", # Used Issue Types使用的Issue类型数\n",
" \"Links\", # 链接总数\n",
" \"DLT\", # Documented Link Types登记的链接类型数\n",
" \"ULT\", # Used Link Types使用的链接类型数\n",
" \"UP\", # Unique Projects项目总数\n",
" \"Changes\", # 更改总数\n",
" \"Ch/I\", # Changes/Issues\n",
" \"Comments\", # 评论总数\n",
" \"Co/I\", # Comments/Issues\n",
" ],\n",
" index=ALL_ECOS + [\"Sum\", \"Mean\", \"Median\", \"Std Dev\"], # 总和、均值、中值、标准差\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "af17b5f7-adea-462a-a6bd-e0bf36290781",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:07.611366Z",
"iopub.status.busy": "2022-01-25T09:42:07.611062Z",
"iopub.status.idle": "2022-01-25T11:06:32.931020Z",
"shell.execute_reply": "2022-01-25T11:06:32.923594Z",
"shell.execute_reply.started": "2022-01-25T09:42:07.611342Z"
},
"tags": []
},
"outputs": [],
"source": [
"def populate_ecos_df(ecos_df, eco_names=ALL_ECOS):\n",
" # 填充DataFrame\n",
"\n",
" def extract_number_of_issues(eco_name):\n",
" # 查询Issue总数\n",
" issues_collection = db[eco_name]\n",
" num_issues = issues_collection.count_documents({})\n",
"\n",
" return num_issues\n",
"\n",
" def extract_number_of_doc_issuetypes(eco_name):\n",
" # 查询记录的Issue类型数\n",
" return len(eco_issue_types[eco_name])\n",
"\n",
" def extract_number_of_used_issuetypes(eco_name):\n",
" # 查询在最后状态下的Issue类型\n",
" issues_collection = db[eco_name]\n",
" final_types_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 取出'$fields.issuetype.name'字段,并重命名\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuetype_name\": \"$fields.issuetype.name\",\n",
" }\n",
" },\n",
" # 分组把所有Issue类型名放入一个集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_final_issuetypes = (\n",
" set(final_types_query[0][\"issuetype_names\"])\n",
" if final_types_query != []\n",
" else set()\n",
" )\n",
"\n",
" # 查询在历史中使用过的Issue类型\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" history_types_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" # 展开'$history.items'数组\n",
" {\"$unwind\": \"$history.items\"},\n",
" # 筛选更改项item的域为'issuetype'的文档\n",
" {\"$match\": {\"history.items.field\": \"issuetype\"}},\n",
" # 取出item的'fromString'即更改前的Issue类型\n",
" # !注意更改后的Issue类型'toString'会在下一次更改中作为更改前的值\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuetype_name\": \"$history.items.fromString\",\n",
" }\n",
" },\n",
" # 分组把所有Issue类型名放入一个集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_history_issuetypes = (\n",
" set(history_types_query[0][\"issuetype_names\"])\n",
" if history_types_query != []\n",
" else set()\n",
" )\n",
" # 合并两个集合\n",
" return len(set.union(unique_final_issuetypes, unique_history_issuetypes))\n",
"\n",
" def extract_number_of_links(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 查询链接总数\n",
" links_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 筛选'$fields.issuelinks'字段非空的文档\n",
" {\"$match\": {\"fields.issuelinks\": {\"$exists\": True, \"$ne\": []}}},\n",
" # 取出issuelink的id字段数组\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuelink_ids_issue\": \"$fields.issuelinks.id\",\n",
" }\n",
" },\n",
" # 把id字段数组展开\n",
" {\"$unwind\": \"$issuelink_ids_issue\"},\n",
" # 统计链接的id\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuelink_unique_ids\": {\n",
" \"$addToSet\": \"$issuelink_ids_issue\"\n",
" },\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" num_issuelinks = (\n",
" len(set(links_query[0][\"issuelink_unique_ids\"])) if links_query != [] else 0\n",
" )\n",
"\n",
" # 查询subtask链接总数\n",
" subtasks_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 筛选'$fields.subtasks'字段非空的文档\n",
" {\"$match\": {\"fields.subtasks\": {\"$exists\": True, \"$ne\": []}}},\n",
" # 计算Issue的subtask数量\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"num_issue_subtasks\": {\"$size\": \"$fields.subtasks\"},\n",
" }\n",
" },\n",
" # 计算整个集合内subtask数量\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"num_subtasks\": {\"$sum\": \"$num_issue_subtasks\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" num_subtasks = subtasks_query[0][\"num_subtasks\"] if subtasks_query != [] else 0\n",
"\n",
" # 查询epic链接总数\n",
" # epic链接字段是自定义的\n",
" EPICLINK_FIELD_DICT = {\n",
" \"Apache\": \"customfield_12311120\",\n",
" \"Jira\": \"customfield_12931\",\n",
" \"Mojang\": \"customfield_11602\",\n",
" \"MongoDB\": \"customfield_10857\",\n",
" \"Qt\": \"customfield_10400\",\n",
" \"RedHat\": \"customfield_12311140\",\n",
" }\n",
" epiclinks_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 把自定义epic链接字段统一重命名为'epiclink_field'\n",
" {\n",
" \"$project\": {\n",
" \"epiclink_field\": f\"$fields.{EPICLINK_FIELD_DICT[eco_name]}\"\n",
" }\n",
" },\n",
" # 筛选epiclink字段非空的文档\n",
" # !注意epic链接是由子Issue指向父Issue的\n",
" {\"$match\": {\"epiclink_field\": {\"$exists\": True, \"$ne\": None}}},\n",
" # 统计聚合的文档总数\n",
" {\"$count\": \"num_epiclinks\"},\n",
" ]\n",
" )\n",
" )\n",
" num_epiclinks = (\n",
" epiclinks_query[0][\"num_epiclinks\"] if epiclinks_query != [] else 0\n",
" )\n",
"\n",
" return sum([num_issuelinks, num_subtasks, num_epiclinks])\n",
"\n",
" def extract_number_of_doc_linktypes(eco_name):\n",
" # 查询记录的链接类型数\n",
" return len(eco_link_types[eco_name])\n",
"\n",
" def extract_number_of_used_linktypes(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 查询在最后状态下的链接类型\n",
" final_linktypes_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 展开issuelinks数组\n",
" {\"$unwind\": \"$fields.issuelinks\"},\n",
" # 选择链接类型名字段\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"linktype_name\": \"$fields.issuelinks.type.name\",\n",
" }\n",
" },\n",
" # 分组,把所有链接类型名加入集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"linktype_names\": {\"$addToSet\": \"$linktype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
"\n",
" return (\n",
" len(set(final_linktypes_query[0][\"linktype_names\"]))\n",
" if final_linktypes_query != []\n",
" else 0\n",
" )\n",
"\n",
" def extract_born(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 取出最初的N个Issue创建时间检查生态的最早创建时间\n",
" created_dates = [\n",
" issue[\"fields\"][\"created\"]\n",
" for issue in issues_collection.aggregate(\n",
" [\n",
" # 取出Issue创建时间\n",
" {\"$project\": {\"_id\": 0, \"fields.created\": 1}},\n",
" # 按创建时间升序排列\n",
" {\"$sort\": {\"fields.created\": 1}},\n",
" # 实际中有些Issue会损坏或者是测试Issue所以需要手动检查创建时间\n",
" {\"$limit\": 500},\n",
" ]\n",
" )\n",
" ]\n",
" # 手动检查创建时间把损坏的或测试Issue的创建时间略过\n",
" if eco_name == \"Apache\":\n",
" created_dates = created_dates[289:]\n",
" elif eco_name == \"Jira\":\n",
" created_dates = created_dates[1:]\n",
" elif eco_name == \"Qt\":\n",
" created_dates = created_dates[7:]\n",
"\n",
" return float(created_dates[0][:4])\n",
"\n",
" def extract_number_of_changes(eco_name):\n",
" # 查询更改总数\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" changes_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" # 取出更改对应的域(数组)\n",
" {\"$project\": {\"_id\": 0, \"history.items.field\": 1}},\n",
" # 把更改数组展开\n",
" {\"$unwind\": \"$history.items\"},\n",
" # 统计更改总数\n",
" {\"$count\": \"num_changes\"},\n",
" ]\n",
" )\n",
" )\n",
"\n",
" return changes_query[0][\"num_changes\"] if changes_query != [] else 0\n",
"\n",
" def extract_number_of_unique_projects(eco_name):\n",
" # 查询在最后状态下的项目名\n",
" issues_collection = db[eco_name]\n",
" final_projects_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" {\"$project\": {\"_id\": 0, \"project_name\": \"$fields.project.name\"}},\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_final_projects = (\n",
" set(final_projects_query[0][\"project_names\"])\n",
" if final_projects_query != []\n",
" else set()\n",
" )\n",
"\n",
" # 查询在历史中的项目名\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" history_projects_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" {\"$unwind\": \"$history.items\"},\n",
" {\n",
" \"$match\": {\n",
" \"history.items.field\": {\"$in\": [\"project\", \"Project\"]}\n",
" }\n",
" },\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"project_name\": \"$history.items.fromString\",\n",
" }\n",
" },\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_history_projects = (\n",
" set(history_projects_query[0][\"project_names\"])\n",
" if history_projects_query != []\n",
" else set()\n",
" )\n",
"\n",
" return len(set.union(unique_final_projects, unique_history_projects))\n",
"\n",
" def extract_number_of_comments(eco_name):\n",
" # 查询评论总数\n",
" comments_collection = db[eco_name + \"Comments\"]\n",
" num_comments = comments_collection.count_documents({})\n",
" return num_comments\n",
"\n",
" start_time = time() # 记录总处理时间\n",
" for eco_name in eco_names:\n",
" eco_start_time = time() # 记录处理一个ecosystem的时间\n",
" print(f\"Working on ecosystem: {eco_name} ...\")\n",
"\n",
" # Issue总数及类型\n",
" ecos_df.loc[eco_name, \"Issues\"] = extract_number_of_issues(eco_name)\n",
" ecos_df.loc[eco_name, \"DIT\"] = extract_number_of_doc_issuetypes(eco_name)\n",
" ecos_df.loc[eco_name, \"UIT\"] = extract_number_of_used_issuetypes(eco_name)\n",
"\n",
" # Link总数及类型\n",
" ecos_df.loc[eco_name, \"Links\"] = extract_number_of_links(eco_name)\n",
" ecos_df.loc[eco_name, \"DLT\"] = extract_number_of_doc_linktypes(eco_name)\n",
" ecos_df.loc[eco_name, \"ULT\"] = extract_number_of_used_linktypes(eco_name)\n",
"\n",
" # 其他信息\n",
" ecos_df.loc[eco_name, \"Year\"] = extract_born(eco_name)\n",
" ecos_df.loc[eco_name, \"Changes\"] = extract_number_of_changes(eco_name)\n",
" ecos_df.loc[eco_name, \"Ch/I\"] = round(\n",
" ecos_df.loc[eco_name, \"Changes\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
" )\n",
" ecos_df.loc[eco_name, \"UP\"] = extract_number_of_unique_projects(eco_name)\n",
" ecos_df.loc[eco_name, \"Comments\"] = extract_number_of_comments(eco_name)\n",
" ecos_df.loc[eco_name, \"Co/I\"] = round(\n",
" ecos_df.loc[eco_name, \"Comments\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
" )\n",
" print(\n",
" f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
" )\n",
" print(\"\")\n",
"\n",
" print(\n",
" f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
" )\n",
" return ecos_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1698189d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Working on ecosystem: RedHat ...\n",
"✔ RedHat completely processed. Duration: 00:00:45\n",
"\n",
"Working on ecosystem: Qt ...\n",
"✔ Qt completely processed. Duration: 00:00:11\n",
"\n",
"✅ All completely processed. Total duration: 00:00:56\n"
]
}
],
"source": [
"ecos_df = populate_ecos_df(\n",
" ecos_df,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fc015a43-fd77-46bd-8712-b65476b36d3c",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T11:06:33.006871Z",
"iopub.status.busy": "2022-01-25T11:06:33.006437Z",
"iopub.status.idle": "2022-01-25T11:06:33.162697Z",
"shell.execute_reply": "2022-01-25T11:06:33.161886Z",
"shell.execute_reply.started": "2022-01-25T11:06:33.006832Z"
},
"tags": []
},
"outputs": [],
"source": [
"def display_ecos_df(ecos_df):\n",
"\n",
" # 计算各列总和、均值、中值以及标准差\n",
" for header in ecos_df.columns:\n",
" if header in [\"Year\"]:\n",
" continue\n",
" ecos_df.loc[\"Sum\", header] = sum(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Mean\", header] = mean(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Median\", header] = median(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Std Dev\", header] = np.std(ecos_df[header][: len(ALL_ECOS)])\n",
"\n",
" # 格式化某些列的值\n",
" comma_separated_columns = {\n",
" col_name: \"{:,.0f}\" for col_name in [\"Issues\", \"Links\", \"Changes\", \"Comments\"]\n",
" }\n",
"\n",
" # 展示DataFrame\n",
" display(\n",
" ecos_df.style.set_table_styles(\n",
" [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n",
" ).format(comma_separated_columns, precision=0)\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7dfad2f7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_9db15 th {\n",
" text-align: left;\n",
"}\n",
"</style>\n",
"<table id=\"T_9db15\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_9db15_level0_col0\" class=\"col_heading level0 col0\" >Year</th>\n",
" <th id=\"T_9db15_level0_col1\" class=\"col_heading level0 col1\" >Issues</th>\n",
" <th id=\"T_9db15_level0_col2\" class=\"col_heading level0 col2\" >DIT</th>\n",
" <th id=\"T_9db15_level0_col3\" class=\"col_heading level0 col3\" >UIT</th>\n",
" <th id=\"T_9db15_level0_col4\" class=\"col_heading level0 col4\" >Links</th>\n",
" <th id=\"T_9db15_level0_col5\" class=\"col_heading level0 col5\" >DLT</th>\n",
" <th id=\"T_9db15_level0_col6\" class=\"col_heading level0 col6\" >ULT</th>\n",
" <th id=\"T_9db15_level0_col7\" class=\"col_heading level0 col7\" >UP</th>\n",
" <th id=\"T_9db15_level0_col8\" class=\"col_heading level0 col8\" >Changes</th>\n",
" <th id=\"T_9db15_level0_col9\" class=\"col_heading level0 col9\" >Ch/I</th>\n",
" <th id=\"T_9db15_level0_col10\" class=\"col_heading level0 col10\" >Comments</th>\n",
" <th id=\"T_9db15_level0_col11\" class=\"col_heading level0 col11\" >Co/I</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row0\" class=\"row_heading level0 row0\" >RedHat</th>\n",
" <td id=\"T_9db15_row0_col0\" class=\"data row0 col0\" >2001</td>\n",
" <td id=\"T_9db15_row0_col1\" class=\"data row0 col1\" >502,297</td>\n",
" <td id=\"T_9db15_row0_col2\" class=\"data row0 col2\" >79</td>\n",
" <td id=\"T_9db15_row0_col3\" class=\"data row0 col3\" >64</td>\n",
" <td id=\"T_9db15_row0_col4\" class=\"data row0 col4\" >268,935</td>\n",
" <td id=\"T_9db15_row0_col5\" class=\"data row0 col5\" >11</td>\n",
" <td id=\"T_9db15_row0_col6\" class=\"data row0 col6\" >11</td>\n",
" <td id=\"T_9db15_row0_col7\" class=\"data row0 col7\" >807</td>\n",
" <td id=\"T_9db15_row0_col8\" class=\"data row0 col8\" >7,197,717</td>\n",
" <td id=\"T_9db15_row0_col9\" class=\"data row0 col9\" >14</td>\n",
" <td id=\"T_9db15_row0_col10\" class=\"data row0 col10\" >1,115,471</td>\n",
" <td id=\"T_9db15_row0_col11\" class=\"data row0 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row1\" class=\"row_heading level0 row1\" >Qt</th>\n",
" <td id=\"T_9db15_row1_col0\" class=\"data row1 col0\" >2005</td>\n",
" <td id=\"T_9db15_row1_col1\" class=\"data row1 col1\" >180,574</td>\n",
" <td id=\"T_9db15_row1_col2\" class=\"data row1 col2\" >15</td>\n",
" <td id=\"T_9db15_row1_col3\" class=\"data row1 col3\" >15</td>\n",
" <td id=\"T_9db15_row1_col4\" class=\"data row1 col4\" >58,621</td>\n",
" <td id=\"T_9db15_row1_col5\" class=\"data row1 col5\" >10</td>\n",
" <td id=\"T_9db15_row1_col6\" class=\"data row1 col6\" >10</td>\n",
" <td id=\"T_9db15_row1_col7\" class=\"data row1 col7\" >60</td>\n",
" <td id=\"T_9db15_row1_col8\" class=\"data row1 col8\" >2,307,707</td>\n",
" <td id=\"T_9db15_row1_col9\" class=\"data row1 col9\" >13</td>\n",
" <td id=\"T_9db15_row1_col10\" class=\"data row1 col10\" >507,214</td>\n",
" <td id=\"T_9db15_row1_col11\" class=\"data row1 col11\" >3</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row2\" class=\"row_heading level0 row2\" >Sum</th>\n",
" <td id=\"T_9db15_row2_col0\" class=\"data row2 col0\" >nan</td>\n",
" <td id=\"T_9db15_row2_col1\" class=\"data row2 col1\" >682,871</td>\n",
" <td id=\"T_9db15_row2_col2\" class=\"data row2 col2\" >94</td>\n",
" <td id=\"T_9db15_row2_col3\" class=\"data row2 col3\" >79</td>\n",
" <td id=\"T_9db15_row2_col4\" class=\"data row2 col4\" >327,556</td>\n",
" <td id=\"T_9db15_row2_col5\" class=\"data row2 col5\" >21</td>\n",
" <td id=\"T_9db15_row2_col6\" class=\"data row2 col6\" >21</td>\n",
" <td id=\"T_9db15_row2_col7\" class=\"data row2 col7\" >867</td>\n",
" <td id=\"T_9db15_row2_col8\" class=\"data row2 col8\" >9,505,424</td>\n",
" <td id=\"T_9db15_row2_col9\" class=\"data row2 col9\" >27</td>\n",
" <td id=\"T_9db15_row2_col10\" class=\"data row2 col10\" >1,622,685</td>\n",
" <td id=\"T_9db15_row2_col11\" class=\"data row2 col11\" >5</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row3\" class=\"row_heading level0 row3\" >Mean</th>\n",
" <td id=\"T_9db15_row3_col0\" class=\"data row3 col0\" >nan</td>\n",
" <td id=\"T_9db15_row3_col1\" class=\"data row3 col1\" >341,436</td>\n",
" <td id=\"T_9db15_row3_col2\" class=\"data row3 col2\" >47</td>\n",
" <td id=\"T_9db15_row3_col3\" class=\"data row3 col3\" >40</td>\n",
" <td id=\"T_9db15_row3_col4\" class=\"data row3 col4\" >163,778</td>\n",
" <td id=\"T_9db15_row3_col5\" class=\"data row3 col5\" >10</td>\n",
" <td id=\"T_9db15_row3_col6\" class=\"data row3 col6\" >10</td>\n",
" <td id=\"T_9db15_row3_col7\" class=\"data row3 col7\" >434</td>\n",
" <td id=\"T_9db15_row3_col8\" class=\"data row3 col8\" >4,752,712</td>\n",
" <td id=\"T_9db15_row3_col9\" class=\"data row3 col9\" >14</td>\n",
" <td id=\"T_9db15_row3_col10\" class=\"data row3 col10\" >811,342</td>\n",
" <td id=\"T_9db15_row3_col11\" class=\"data row3 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row4\" class=\"row_heading level0 row4\" >Median</th>\n",
" <td id=\"T_9db15_row4_col0\" class=\"data row4 col0\" >nan</td>\n",
" <td id=\"T_9db15_row4_col1\" class=\"data row4 col1\" >341,436</td>\n",
" <td id=\"T_9db15_row4_col2\" class=\"data row4 col2\" >47</td>\n",
" <td id=\"T_9db15_row4_col3\" class=\"data row4 col3\" >40</td>\n",
" <td id=\"T_9db15_row4_col4\" class=\"data row4 col4\" >163,778</td>\n",
" <td id=\"T_9db15_row4_col5\" class=\"data row4 col5\" >10</td>\n",
" <td id=\"T_9db15_row4_col6\" class=\"data row4 col6\" >10</td>\n",
" <td id=\"T_9db15_row4_col7\" class=\"data row4 col7\" >434</td>\n",
" <td id=\"T_9db15_row4_col8\" class=\"data row4 col8\" >4,752,712</td>\n",
" <td id=\"T_9db15_row4_col9\" class=\"data row4 col9\" >14</td>\n",
" <td id=\"T_9db15_row4_col10\" class=\"data row4 col10\" >811,342</td>\n",
" <td id=\"T_9db15_row4_col11\" class=\"data row4 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row5\" class=\"row_heading level0 row5\" >Std Dev</th>\n",
" <td id=\"T_9db15_row5_col0\" class=\"data row5 col0\" >nan</td>\n",
" <td id=\"T_9db15_row5_col1\" class=\"data row5 col1\" >160,862</td>\n",
" <td id=\"T_9db15_row5_col2\" class=\"data row5 col2\" >32</td>\n",
" <td id=\"T_9db15_row5_col3\" class=\"data row5 col3\" >24</td>\n",
" <td id=\"T_9db15_row5_col4\" class=\"data row5 col4\" >105,157</td>\n",
" <td id=\"T_9db15_row5_col5\" class=\"data row5 col5\" >0</td>\n",
" <td id=\"T_9db15_row5_col6\" class=\"data row5 col6\" >0</td>\n",
" <td id=\"T_9db15_row5_col7\" class=\"data row5 col7\" >374</td>\n",
" <td id=\"T_9db15_row5_col8\" class=\"data row5 col8\" >2,445,005</td>\n",
" <td id=\"T_9db15_row5_col9\" class=\"data row5 col9\" >0</td>\n",
" <td id=\"T_9db15_row5_col10\" class=\"data row5 col10\" >304,128</td>\n",
" <td id=\"T_9db15_row5_col11\" class=\"data row5 col11\" >0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x7f2c251df1f0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_ecos_df(ecos_df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "63163049",
"metadata": {},
"outputs": [],
"source": [
"ecos_df.to_csv(\"../data/ecos_overview.csv\", sep=\";\", encoding=\"utf-8\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
},
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,421,0,1.0,2.1489339812279398,3.5470614635577187
RedHat,with,117,0,1.0,2.2934478185438794,3.9874327658983035
RedHat,in,117,0,1.0,2.268535068928034,4.001612675816817
RedHat,cross,117,0,2.0,3.1102165516976346,4.951732355023365

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,421,0,1.0,1.8907019826905265,3.290788730118782
RedHat,with,117,0,2.0,2.993302339192955,4.6213686922437915
RedHat,in,117,0,2.0,3.0957131121431085,4.758508197318559
RedHat,cross,117,0,2.0,3.1515352998065764,4.9921727596311385

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
ecosystem,#issues,#issues_with_links,#links,%issues_with_links,#max_links,#min_links,#median_links,#mean_links,#link_types,#projects,#links_cross_project,%links_cross_project
RedHat,502297,249581,238053,49.69,399,1,1.0,1.9076211730860924,13,279,34866,14.65
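A hedged sketch of how headline figures such as %issues_with_links and %links_cross_project could be recomputed; it assumes the cleaned per-ecosystem CSVs keep the raw column layout (issues keyed by issue key; links with link_type, in_issue_key, out_issue_key) and the ";" delimiter used elsewhere in this commit:

import pandas as pd

# Assumed paths and columns; adjust to the actual processed files.
issues = pd.read_csv("../data/processed/issues/RedHat.csv", sep=";", index_col="key")
links = pd.read_csv("../data/processed/links/RedHat.csv", sep=";")

# Share of issues that participate in at least one link.
linked_issues = set(links["in_issue_key"]) | set(links["out_issue_key"])
per_issues_with_links = 100 * len(linked_issues & set(issues.index)) / len(issues)

# A Jira key is "<PROJECT>-<number>", so a cross-project link joins keys with different prefixes.
in_proj = links["in_issue_key"].str.split("-").str[0]
out_proj = links["out_issue_key"].str.split("-").str[0]
per_links_cross_project = 100 * (in_proj != out_proj).mean()

print(round(per_issues_with_links, 2), round(per_links_cross_project, 2))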

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,5 @@
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,without,252716,80693,31.93,172023,68.07
RedHat,with,249581,81104,32.5,168477,67.5
RedHat,in,219867,69418,31.57,150449,68.43
RedHat,cross,52043,20816,40.0,31227,60.0

View File

@ -0,0 +1,5 @@
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,without,351946,112787,32.05,239159,67.95
RedHat,with,150351,49010,32.6,101341,67.4
RedHat,in,119294,36870,30.91,82424,69.09
RedHat,cross,49632,20120,40.54,29512,59.46

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,6412,0,103,460,856
RedHat,with,6283,0,113,366,660
RedHat,in,6283,0,104,340,639
RedHat,cross,5815,0,219,531,736

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,6412,0,95,405,785
RedHat,with,6283,0,140,432,720
RedHat,in,6283,0,124,399,701
RedHat,cross,5815,0,230,548,746

Binary file not shown.

View File

@ -0,0 +1,7 @@
ecosystem,scope,interval_type,max,min,median,mean,std
RedHat,with,cti,5372,0,24,103,218
RedHat,with,lti,5829,0,0,207,729
RedHat,in,cti,5372,0,22,99,210
RedHat,in,lti,5829,0,0,180,691
RedHat,cross,cti,4815,0,33,132,268
RedHat,cross,lti,5766,0,0,408,950

View File

@ -0,0 +1,7 @@
ecosystem,scope,interval_type,max,min,median,mean,std
RedHat,with,cti,5372,0,28,120,256
RedHat,with,lti,5829,0,0,399,979
RedHat,in,cti,5372,0,28,118,251
RedHat,in,lti,5829,0,0,388,978
RedHat,cross,cti,4815,0,28,128,271
RedHat,cross,lti,5766,0,0,443,984

View File

@ -0,0 +1,14 @@
ecosystem,link_type,min,q1,median,q3,max,mean,std
RedHat,Account,0,0.0,2.0,5.0,87,3.9,6.5
RedHat,Blocks,0,1.0,2.0,4.0,117,3.5,5.5
RedHat,Causality,0,0.0,2.0,4.0,100,3.6,5.7
RedHat,Cloners,0,0.0,1.0,3.0,100,2.4,3.9
RedHat,Depend,0,0.0,1.0,3.0,91,3.0,5.0
RedHat,Document,0,0.0,2.0,5.0,63,3.6,5.3
RedHat,Duplicate,0,1.0,2.0,4.0,117,3.4,5.2
RedHat,Epic,0,0.0,1.0,2.0,80,1.7,3.3
RedHat,Incorporates,0,1.0,1.0,3.0,83,2.6,4.0
RedHat,Issue split,0,0.0,1.0,3.0,77,2.5,5.0
RedHat,Related,0,1.0,2.0,5.0,117,3.9,5.6
RedHat,Subtask,0,0.0,1.0,2.0,117,1.6,3.2
RedHat,Triggers,0,0.0,2.0,4.0,62,3.4,5.6

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,14 @@
ecosystem,link_type,number,percentage,num_in,per_in,num_cross,per_cross
RedHat,Epic,67799,28.48,65360,96.4,2439,3.6
RedHat,Subtask,45020,18.91,45013,99.98,7,0.02
RedHat,Related,44222,18.58,34481,77.97,9741,22.03
RedHat,Cloners,29629,12.45,19142,64.61,10487,35.39
RedHat,Blocks,21106,8.87,16367,77.55,4739,22.45
RedHat,Incorporates,12847,5.4,9154,71.25,3693,28.75
RedHat,Duplicate,7080,2.97,6414,90.59,666,9.41
RedHat,Causality,4122,1.73,2714,65.84,1408,34.16
RedHat,Depend,2849,1.2,2311,81.12,538,18.88
RedHat,Document,1652,0.69,811,49.09,841,50.91
RedHat,Issue split,694,0.29,665,95.82,29,4.18
RedHat,Account,568,0.24,462,81.34,106,18.66
RedHat,Triggers,465,0.2,293,63.01,172,36.99

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,14 @@
ecosystem,link_type,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,Account,886,246,27.77,640,72.23
RedHat,Blocks,29857,9117,30.54,20740,69.46
RedHat,Causality,6920,2563,37.04,4357,62.96
RedHat,Cloners,48828,18147,37.17,30681,62.83
RedHat,Depend,4247,1647,38.78,2600,61.22
RedHat,Document,2795,672,24.04,2123,75.96
RedHat,Duplicate,13104,2646,20.19,10458,79.81
RedHat,Epic,77019,26516,34.43,50503,65.57
RedHat,Incorporates,16505,8090,49.02,8415,50.98
RedHat,Issue split,1071,467,43.6,604,56.4
RedHat,Related,63125,19934,31.58,43191,68.42
RedHat,Subtask,55986,15859,28.33,40127,71.67
RedHat,Triggers,779,284,36.46,495,63.54

View File

@ -0,0 +1,14 @@
ecosystem,link_type,min,q1,median,q3,max,mean,std
RedHat,Account,0.0,16.9,62.9,169.7,1189.9,129.8,169.4
RedHat,Blocks,0.0,48.6,174.1,552.9,6283.1,468.0,722.8
RedHat,Causality,0.0,40.2,167.9,608.2,3832.2,440.4,587.5
RedHat,Cloners,0.0,29.1,126.6,381.8,4714.3,342.2,532.4
RedHat,Depend,0.0,23.1,68.1,155.0,2052.2,115.4,145.9
RedHat,Document,0.0,55.0,132.9,326.5,3792.5,253.9,337.0
RedHat,Duplicate,0.0,16.2,104.0,444.4,5946.3,434.5,804.4
RedHat,Epic,0.0,27.8,99.3,294.9,4253.5,233.3,329.4
RedHat,Incorporates,0.0,80.1,213.4,575.2,5900.5,486.7,684.8
RedHat,Issue split,0.0,21.0,52.9,156.3,1173.3,118.5,161.8
RedHat,Related,0.0,45.1,173.7,594.8,6111.5,511.0,814.9
RedHat,Subtask,0.0,20.0,77.0,295.0,5899.6,325.7,702.4
RedHat,Triggers,0.0,18.3,63.0,197.0,1487.8,152.2,212.5

Binary file not shown.

View File

@ -0,0 +1,27 @@
ecosystem,link_type,interval_type,min,q1,median,q3,max,mean,std
RedHat,Account,cti,0.0,7.5,28.3,85.4,2004.1,87.4,191.1
RedHat,Account,lti,0.0,0.0,0.0,3.4,371.7,12.3,41.8
RedHat,Blocks,cti,0.0,1.1,20.8,91.1,4020.7,93.5,199.4
RedHat,Blocks,lti,0.0,2.1,1119.1,2767.7,5829.9,1541.8,1622.6
RedHat,Causality,cti,0.0,7.3,43.9,149.4,3948.3,155.4,309.5
RedHat,Causality,lti,0.0,0.0,359.4,1160.0,2693.0,600.7,652.4
RedHat,Cloners,cti,0.0,0.0,13.4,71.2,2884.2,71.5,171.0
RedHat,Cloners,lti,0.0,0.0,0.0,0.0,5546.1,137.0,661.2
RedHat,Depend,cti,0.0,0.0,17.1,86.0,2884.0,79.8,180.4
RedHat,Depend,lti,0.0,0.0,0.0,4.4,874.4,13.9,46.8
RedHat,Document,cti,0.0,13.0,53.7,121.0,2700.0,119.3,221.6
RedHat,Document,lti,0.0,0.0,0.0,5.8,997.2,17.3,62.6
RedHat,Duplicate,cti,0.0,5.7,33.0,140.8,3196.8,138.3,275.9
RedHat,Duplicate,lti,0.0,0.2,3.9,34.2,2638.1,50.5,144.9
RedHat,Epic,cti,0.0,1.5,39.4,129.4,4159.7,103.3,169.9
RedHat,Epic,lti,0.0,0.0,0.0,0.0,1444.8,6.4,35.3
RedHat,Incorporates,cti,0.0,5.1,26.8,96.3,4179.5,106.5,238.7
RedHat,Incorporates,lti,0.0,0.0,0.0,6.9,3161.8,19.8,95.8
RedHat,Issue split,cti,0.0,14.0,28.0,92.8,2128.7,87.2,163.3
RedHat,Issue split,lti,0.0,0.0,0.0,107.6,1163.8,102.2,200.3
RedHat,Related,cti,0.0,7.4,46.8,165.6,5372.6,159.9,309.3
RedHat,Related,lti,0.0,0.0,0.1,36.2,3677.8,181.5,478.4
RedHat,Subtask,cti,0.0,0.0,1.6,46.2,3593.7,60.1,159.3
RedHat,Subtask,lti,0.0,0.0,0.0,0.0,3677.8,1.3,27.4
RedHat,Triggers,cti,0.0,11.3,32.2,124.0,1173.6,100.4,167.2
RedHat,Triggers,lti,0.0,0.0,0.0,0.2,2599.0,17.6,134.1

View File

@ -0,0 +1,360 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"from time import time\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"from pymongo import MongoClient\n",
"from pymongo.database import Database"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# 软件生态名\n",
"ECO_NAMES = [\n",
" # \"Apache\",\n",
" # \"Jira\",\n",
" # \"Mojang\",\n",
" # \"MongoDB\",\n",
" # \"Qt\",\n",
" \"RedHat\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 保存Issue和链接原始数据的目录\n",
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
"ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
"LINK_DIR = Path(\"../data/raw/links\")\n",
"LINK_DIR.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 格式化时间间隔\n",
"def format_duration(start_time, end_time):\n",
" # 计算总秒数\n",
" seconds = end_time - start_time\n",
" # 计算分钟和小时数\n",
" minutes = int(seconds / 60)\n",
" hours = int(minutes / 60)\n",
" display_minutes = int(minutes % 60)\n",
" display_seconds = int(seconds % 60)\n",
"\n",
" return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def extract_issues_to_csv(eco_name: str, db: Database):\n",
" # 提取生态中的Issue字段数据保存至csv文件\n",
" issues_collection = db[eco_name]\n",
" # 迭代访问所有文档\n",
" cursor = issues_collection.find({})\n",
"\n",
" issues = []\n",
" # 分别记录提取的Issue和comments总数\n",
" num_comments = 0\n",
"\n",
" for issue in tqdm(cursor):\n",
" try:\n",
" key = issue[\"key\"] # Issue关键字\n",
"\n",
" project_key = issue[\"fields\"][\"project\"][\"key\"] # 所属项目关键字\n",
"\n",
" project_name = issue[\"fields\"][\"project\"][\"name\"] # 所属项目名\n",
"\n",
" try:\n",
" issue_type = issue[\"fields\"][\"issuetype\"][\"name\"] # Issue类型\n",
" except Exception:\n",
" issue_type = None\n",
"\n",
" try:\n",
" status = issue[\"fields\"][\"status\"][\"name\"] # 状态\n",
" except Exception:\n",
" status = None\n",
"\n",
" try:\n",
" resolution = issue[\"fields\"][\"resolution\"][\"name\"] # 解决与否\n",
" except Exception:\n",
" resolution = None\n",
"\n",
" try:\n",
" created_time = issue[\"fields\"][\"created\"] # 创建时间\n",
" except Exception:\n",
" created_time = None\n",
"\n",
" try:\n",
" priority = issue[\"fields\"][\"priority\"][\"name\"] # 优先级\n",
" except Exception:\n",
" priority = None\n",
"\n",
" try:\n",
" title = issue[\"fields\"][\"summary\"] # 标题\n",
" except Exception:\n",
" title = None\n",
"\n",
" try:\n",
" description = issue[\"fields\"][\"description\"] # 描述\n",
" except Exception:\n",
" description = None\n",
"\n",
" num_issue_comments = issue[\"fields\"][\"comment\"][\n",
" \"total\"\n",
" ] # 该Issue的评论数量\n",
"\n",
" issue_dict = {\n",
" \"key\": key,\n",
" \"project_key\": project_key,\n",
" \"project_name\": project_name,\n",
" \"issue_type\": issue_type,\n",
" \"status\": status,\n",
" \"resolution\": resolution,\n",
" \"created_time\": created_time,\n",
" \"priority\": priority,\n",
" \"title\": title,\n",
" \"description\": description,\n",
" \"num_comments\": num_issue_comments,\n",
" }\n",
"\n",
" issues.append(issue_dict)\n",
" num_comments += num_issue_comments\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" filename = ISSUE_DIR / (eco_name + \".csv\")\n",
" with open(filename, \"w\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
" dict_wirter = csv.DictWriter(output_file, issues[0].keys(), delimiter=\";\")\n",
" dict_wirter.writeheader()\n",
" dict_wirter.writerows(issues)\n",
"\n",
" print(\n",
" f\"✔ Extracted {len(issues)} raw issues with {num_comments} comments from {eco_name}\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Epic链接自定义字段\n",
"EPICLINK_FIELD_DICT = {\n",
" \"Apache\": \"customfield_12311120\",\n",
" \"Jira\": \"customfield_12931\",\n",
" \"Mojang\": \"customfield_11602\",\n",
" \"MongoDB\": \"customfield_10857\",\n",
" \"Qt\": \"customfield_10400\",\n",
" \"RedHat\": \"customfield_12311140\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def extract_links_to_csv(eco_name: str, db: Database):\n",
" # 提取生态中的链接数据保存至csv文件\n",
"\n",
" issues_collection = db[eco_name]\n",
"\n",
" links = []\n",
" cursor = issues_collection.find({})\n",
" for issue in tqdm(cursor):\n",
" try:\n",
" key = issue[\"key\"]\n",
"\n",
" # 提取一般类型链接\n",
" issuelinks = issue[\"fields\"][\"issuelinks\"]\n",
" for issuelink in issuelinks:\n",
" link_type = issuelink[\"type\"][\"name\"]\n",
"\n",
" try:\n",
" in_issue_key = key\n",
" out_issue_key = issuelink[\"outwardIssue\"][\"key\"]\n",
" except Exception:\n",
" in_issue_key = issuelink[\"inwardIssue\"][\"key\"]\n",
" out_issue_key = key\n",
"\n",
" link_key = in_issue_key + \"_\" + out_issue_key # 用作链接的唯一标识\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" # 提取Subtask链接\n",
" subtasks = issue[\"fields\"][\"subtasks\"]\n",
" for subtask in subtasks:\n",
"\n",
" link_type = \"Subtask\"\n",
" in_issue_key = key\n",
" out_issue_key = subtask[\n",
" \"key\"\n",
" ] # Subtask类型链接都是由父Issue指向子Issue\n",
"\n",
" link_key = in_issue_key + \"_\" + out_issue_key\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" # 提取Epic链接\n",
" try:\n",
" epic_key = issue[\"fields\"][EPICLINK_FIELD_DICT[eco_name]]\n",
" in_issue_key = key\n",
" out_issue_key = epic_key\n",
" link_key = in_issue_key + \"_\" + out_issue_key\n",
" link_type = \"Epic\"\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" filename = LINK_DIR / (eco_name + \".csv\")\n",
" with open(filename, \"w\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
" dict_wirter = csv.DictWriter(output_file, links[0].keys(), delimiter=\";\")\n",
" dict_wirter.writeheader()\n",
" dict_wirter.writerows(links)\n",
"\n",
" print(f\"✔ Extracted {len(links)} raw links from {eco_name}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting issues and links data from database...\n",
"Working on ecosystem: RedHat ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"502297it [00:43, 11442.78it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Extracted 502297 raw issues with 1115471 comments from RedHat\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"502297it [00:38, 13038.55it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Extracted 405070 raw links from RedHat\n",
"✔ RedHat completely processed. Duration: 00:01:31\n",
"\n",
"✅ All completely processed. Total duration: 00:01:31\n"
]
}
],
"source": [
"with MongoClient() as client:\n",
" start_time = time() # 记录总处理时间\n",
" db = client[\"JiraEcos\"]\n",
" print(\"Extracting issues and links data from database...\")\n",
"\n",
" for eco_name in ECO_NAMES:\n",
" eco_start_time = time() # 记录提取每个生态数据的时间\n",
" print(f\"Working on ecosystem: {eco_name} ...\")\n",
"\n",
" extract_issues_to_csv(eco_name, db)\n",
" extract_links_to_csv(eco_name, db)\n",
"\n",
" print(\n",
" f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
" )\n",
" print(\"\")\n",
"\n",
" print(\n",
" f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "grad_pro_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,589 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1345f1c8",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pymongo import MongoClient"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "46d49dd9",
"metadata": {},
"outputs": [],
"source": [
"# 软件生态名\n",
"ECO_NAMES = [\n",
" # \"Apache\",\n",
" # \"Jira\",\n",
" # \"Mojang\",\n",
" # \"MongoDB\",\n",
" # \"Qt\",\n",
" \"RedHat\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "03cf285e",
"metadata": {},
"outputs": [],
"source": [
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
"LINK_DIR = Path(\"../data/raw/links\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "61e8f96a",
"metadata": {},
"outputs": [],
"source": [
"PRO_ISSUE_DIR = Path(\"../data/processed/issues\")\n",
"PRO_ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
"PRO_LINK_DIR = Path(\"../data/processed/links\")\n",
"PRO_LINK_DIR.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40885839",
"metadata": {},
"outputs": [],
"source": [
"def load_issues(eco_name: str):\n",
" # 加载Issue数据DataFrame\n",
"\n",
" filename = ISSUE_DIR / (eco_name + \".csv\")\n",
" issue_df = pd.read_csv(\n",
" filename, sep=\";\", encoding=\"utf-8\", low_memory=False, index_col=[\"key\"]\n",
" )\n",
" return issue_df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1981c072",
"metadata": {},
"outputs": [],
"source": [
"def load_links(eco_name: str):\n",
" # 加载链接数据DataFrame\n",
"\n",
" filename = LINK_DIR / (eco_name + \".csv\")\n",
" link_df = pd.read_csv(filename, sep=\";\", encoding=\"utf-8\", low_memory=False)\n",
" return link_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "865088c5",
"metadata": {},
"outputs": [],
"source": [
"def clean_issues(issue_df: pd.DataFrame):\n",
" # 对Issue数据进行清洗\n",
"\n",
" # 把时间数据转换为统一格式\n",
" issue_df[\"created_time\"] = pd.to_datetime(\n",
" issue_df[\"created_time\"], errors=\"coerce\"\n",
" ).apply(lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan)\n",
"\n",
" return issue_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4447a7d2",
"metadata": {},
"outputs": [],
"source": [
"def clean_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
" # 对链接数据进行清洗\n",
"\n",
" def column_transform(row):\n",
" return str(sorted(set([row[\"in_issue_key\"], row[\"out_issue_key\"]])))\n",
"\n",
" # 一条一般类型链接会在两个Issue的字段中存在需要清除其中一个副本\n",
" link_df.drop_duplicates(inplace=True)\n",
" print(f\"Left with {len(link_df)} links after removing link duplication\")\n",
"\n",
" # 清除Issue是私有的、无权访问的链接\n",
" condition = (\n",
" link_df[[\"in_issue_key\", \"out_issue_key\"]]\n",
" .isin(issue_df.index.values)\n",
" .all(axis=1)\n",
" )\n",
" link_df = link_df[condition]\n",
" print(f\"Left with {len(link_df)} links after removing half-private issues\")\n",
"\n",
" # 一对Issue间只允许存在一条链接需要删除含有多条链接的Issue对\n",
" # 首先基于'link_key'字段删除重复的Issue对\n",
" # !注意:相同的'link_key'的Issue对之间是可能存在多种类型的链接这会混淆关联关系所以全部清除\n",
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
"\n",
" # 其次,以防'link_key'是反过来的比如issue1_issue2和issue2_issue1\n",
" # 所以添加'sorted_issue_keys'字段由链接的两个Issue的key升序组成\n",
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
" # 找出链接的两端Issue的keys相同的行对应的'sorted_issue_keys'字段值\n",
" doublelinks = (\n",
" (link_df[\"sorted_issue_keys\"].value_counts() > 1)\n",
" .rename_axis(\"doubles\")\n",
" .reset_index(name=\"valid\")\n",
" )\n",
" valid_double_keys = set(doublelinks[doublelinks[\"valid\"] == True][\"doubles\"])\n",
"\n",
" # 把重复的'sorted_issue_keys'字段对应的链接类型取出来检查若类型数大于1则清除这些Issue对\n",
" for i in tqdm(valid_double_keys):\n",
" if len(set(link_df[link_df[\"sorted_issue_keys\"] == i][\"link_type\"])) > 1:\n",
" condition = link_df[\"sorted_issue_keys\"] != i\n",
" link_df = link_df[condition]\n",
" print(\n",
" f\"Left with {len(link_df)} links after removing issue-pairs with multiple types of links between them\"\n",
" )\n",
"\n",
" # 最后留下来的链接中仍然可能有重复链接类型的Issue对通过Issue的key对调的方式实现的清除其中一个\n",
" link_df.drop_duplicates(subset=[\"sorted_issue_keys\"], inplace=True)\n",
" print(\n",
" f\"Left with {len(link_df)} links after removing issue-pairs with duplicate same type of links\"\n",
" )\n",
"\n",
" link_df.reset_index(inplace=True, drop=True)\n",
"\n",
" return link_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ab8df842",
"metadata": {},
"outputs": [],
"source": [
"def joined_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
" # 联合Issue和链接数据\n",
"\n",
" joined_df = link_df.join(issue_df.add_suffix(\"_in\"), on=\"in_issue_key\").join(\n",
" issue_df.add_suffix(\"_out\"), on=\"out_issue_key\"\n",
" )\n",
"\n",
" # !注意补充Subtask类型链接创建时间\n",
" joined_df.loc[\n",
" (joined_df[\"link_type\"] == \"Subtask\") & (joined_df[\"link_created_time\"].isna()),\n",
" \"link_created_time\",\n",
" ] = joined_df[\"created_time_out\"]\n",
"\n",
" return joined_df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a496e37",
"metadata": {},
"outputs": [],
"source": [
"def query_issue_closed_time(eco_name: str, issue_df: pd.DataFrame):\n",
" # 查询Issue的history获取Issue关闭时间\n",
"\n",
" # 定义一个函数,用于处理每个分组\n",
" def handle_group(group):\n",
" # 如果分组内的closed_time全为NaN则保留该分组的第一行\n",
" if group[\"closed_time\"].isna().all():\n",
" return group.iloc[0:1]\n",
" # 否则返回closed_time最大值对应的行\n",
" else:\n",
" return group.loc[[group[\"closed_time\"].idxmax()]]\n",
"\n",
" # 把索引列转换为普通列列名为key\n",
" issue_df = issue_df.reset_index().rename(columns={\"index\": \"key\"})\n",
" # 创建Issue关闭时间列\n",
" issue_df[\"closed_time\"] = None\n",
"\n",
" with MongoClient() as client:\n",
" # 链接数据库\n",
" db = client[\"JiraEcos\"]\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
"\n",
" # 首先取出需要查询的Issue keys并移除重复项\n",
" issue_keys = issue_df[\"key\"].unique().tolist()\n",
"\n",
" # 构造聚合查询管道\n",
" pipeline = [\n",
" {\n",
" # 第一步筛选key在issue_keys列表中的文档\n",
" \"$match\": {\"key\": {\"$in\": issue_keys}}\n",
" },\n",
" {\n",
" # 第二步展开history.items数组\n",
" # 进而返回每个具体更改事件\n",
" \"$unwind\": \"$history.items\"\n",
" },\n",
" {\n",
" # 第三步再次筛选满足特定field值的展开后的文档\n",
" # field字段为status保证更改事件是修改Issue状态\n",
" # toString字段为Closed保证是关闭Issue\n",
" \"$match\": {\n",
" \"history.items.field\": \"status\",\n",
" \"history.items.toString\": \"Closed\",\n",
" }\n",
" },\n",
" {\n",
" # 第四步:指定返回文档的字段\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"query_key\": \"$key\",\n",
" \"created\": \"$history.created\",\n",
" \"field\": \"$history.items.field\",\n",
" \"to\": \"$history.items.to\",\n",
" \"toString\": \"$history.items.toString\",\n",
" }\n",
" },\n",
" ]\n",
"\n",
" # 查询数据库\n",
" query = list(histories_collection.aggregate(pipeline))\n",
" # 转换为DataFrame\n",
" query_df = pd.DataFrame(query)\n",
" # print(query_df.head())\n",
"\n",
" print(\n",
" f\"❕ Test print: {len(issue_df)} issues before merged with query DataFrame\"\n",
" )\n",
"\n",
" # 合并DataFrame基于key与query_key匹配\n",
" merged_df = pd.merge(\n",
" issue_df,\n",
" query_df,\n",
" left_on=\"key\",\n",
" right_on=\"query_key\",\n",
" how=\"left\",\n",
" )\n",
"\n",
" print(\n",
" f\"❕ Test print: {len(merged_df)} issues after merged with query DataFrame\"\n",
" )\n",
"\n",
" # 将merged_df中的created值赋给合并后DataFrame的closed_time字段\n",
" merged_df[\"closed_time\"] = pd.to_datetime(merged_df[\"created\"], errors=\"coerce\")\n",
"\n",
" # 裁切出需要的字段\n",
" issue_df = merged_df[list(issue_df.columns)]\n",
"\n",
" # 最后由于Issue可能会被多次开启与关闭所以保留最后一次关闭时间\n",
" # 按照key进行分组使用groupby和apply处理每个分组\n",
" result_df = (\n",
" issue_df.groupby(\"key\", as_index=False)\n",
" .apply(handle_group)\n",
" .reset_index(drop=True)\n",
" )\n",
" # 统一时间格式\n",
" result_df[\"closed_time\"] = result_df[\"closed_time\"].apply(\n",
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
" )\n",
" # 把key列重新设置为索引列\n",
" result_df = result_df.set_index(\"key\")\n",
"\n",
" print(f\"❕ Test print: {len(result_df)} issues after processed done\")\n",
"\n",
" return result_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3401742e",
"metadata": {},
"outputs": [],
"source": [
"def query_link_created_time(eco_name: str, link_df: pd.DataFrame):\n",
" # 查询Issue的history获取链接创建时间\n",
"\n",
" # 定义一个函数,用于处理每个分组\n",
" def handle_group(group):\n",
" # 如果分组内的link_created_time全为NaN则保留该分组的第一行\n",
" if group[\"link_created_time\"].isna().all():\n",
" return group.iloc[0:1]\n",
" # 否则返回link_created_time最大值对应的行\n",
" else:\n",
" return group.loc[[group[\"link_created_time\"].idxmax()]]\n",
"\n",
" # 裁切出需要的字段\n",
" link_df = link_df[[\"link_type\", \"in_issue_key\", \"out_issue_key\"]]\n",
" # 创建链接创建时间列\n",
" link_df[\"link_created_time\"] = None\n",
"\n",
" with MongoClient() as client:\n",
" # 链接数据库\n",
" db = client[\"JiraEcos\"]\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
"\n",
" # 首先取出需要查询的Issue keys并移除重复项\n",
" out_issue_keys = link_df[\"out_issue_key\"].tolist()\n",
" out_issue_keys = list(set(out_issue_keys))\n",
"\n",
" # 构造聚合查询管道\n",
" pipeline = [\n",
" {\n",
" # 第一步筛选key在out_issue_keys列表中的文档\n",
" # 从而保证只取出有链接的Issue的更改事件\n",
" \"$match\": {\"key\": {\"$in\": out_issue_keys}}\n",
" },\n",
" {\n",
" # 第二步展开history.items数组\n",
" # 进而返回每个具体更改事件\n",
" \"$unwind\": \"$history.items\"\n",
" },\n",
" {\n",
" # 第三步再次筛选满足特定field值的展开后的文档\n",
" # field保证更改事件是链接创建或删除\n",
" # to或toString字段不为空保证是创建链接的事件而不是删除\n",
" \"$match\": {\n",
" \"history.items.field\": {\n",
" \"$in\": [\"Link\", \"Epic Child\", \"Parent\", \"Parent Issue\"]\n",
" },\n",
" \"$or\": [\n",
" {\"history.items.to\": {\"$ne\": None}},\n",
" {\"history.items.toString\": {\"$ne\": None}},\n",
" ],\n",
" }\n",
" },\n",
" {\n",
" # 第四步:指定返回文档的格式\n",
" # target_key根据链接类型获取to或toString字段信息\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"key\": 1,\n",
" \"created\": \"$history.created\",\n",
" \"field\": \"$history.items.field\",\n",
" \"to\": \"$history.items.to\",\n",
" \"toString\": \"$history.items.toString\",\n",
" \"target_key\": {\n",
" \"$cond\": {\n",
" \"if\": {\"$eq\": [\"$history.items.field\", \"Link\"]},\n",
" \"then\": \"$history.items.to\",\n",
" \"else\": \"$history.items.toString\",\n",
" }\n",
" },\n",
" }\n",
" },\n",
" ]\n",
"\n",
" # 查询数据库\n",
" query = list(histories_collection.aggregate(pipeline))\n",
" # 转换为DataFrame\n",
" query_df = pd.DataFrame(query)\n",
" # print(query_df.head())\n",
"\n",
" # 合并DataFrame基于out_issue_key和key匹配in_issue_key和target_key匹配\n",
" merged_df = pd.merge(\n",
" link_df,\n",
" query_df,\n",
" left_on=[\"out_issue_key\", \"in_issue_key\"],\n",
" right_on=[\"key\", \"target_key\"],\n",
" how=\"left\",\n",
" )\n",
"\n",
" # 将merged_df中的created值赋给合并后DataFrame的link_created_time字段\n",
" merged_df[\"link_created_time\"] = merged_df[\"created\"]\n",
"\n",
" # 裁切出需要的字段\n",
" link_df = merged_df[\n",
" [\"link_type\", \"in_issue_key\", \"out_issue_key\", \"link_created_time\"]\n",
" ]\n",
"\n",
" # 最后由于in_issue和out_issue之间可能会发生相同类型链接的多次创建活动\n",
" # 所以,保留最后一次链接创建时间\n",
" # 转换link_created_time为datetime以确保比较的准确性\n",
" link_df[\"link_created_time\"] = pd.to_datetime(\n",
" link_df[\"link_created_time\"], errors=\"coerce\"\n",
" )\n",
"\n",
" # 按照除link_created_time以外的所有字段进行分组使用groupby和apply处理每个分组\n",
" result_df = (\n",
" link_df.groupby(\n",
" [\"link_type\", \"in_issue_key\", \"out_issue_key\"], as_index=False\n",
" )\n",
" .apply(handle_group)\n",
" .reset_index(drop=True)\n",
" )\n",
" # 统一时间格式\n",
" result_df[\"link_created_time\"] = result_df[\"link_created_time\"].apply(\n",
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
" )\n",
"\n",
" return result_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b42827a1",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Loaded 502297 raw issues and 405070 raw links for RedHat\n",
"Left with 268935 links after removing link duplication\n",
"Left with 249733 links after removing half-private issues\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_523178/4239802332.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
"/tmp/ipykernel_523178/4239802332.py:27: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
"100%|██████████| 4004/4004 [01:48<00:00, 36.81it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Left with 238349 links after removing issue-pairs with multiple types of links between them\n",
"Left with 238053 links after removing issue-pairs with duplicate same type of links\n",
"✔ Cleaned 502297 issues for RedHat\n",
"✔ Cleaned 238053 links for RedHat\n",
"✔ Link type distribution:\n",
"link_type\n",
"Epic 67799\n",
"Subtask 45020\n",
"Related 44222\n",
"Cloners 29629\n",
"Blocks 21106\n",
"Incorporates 12847\n",
"Duplicate 7080\n",
"Causality 4122\n",
"Depend 2849\n",
"Document 1652\n",
"Issue split 694\n",
"Account 568\n",
"Triggers 465\n",
"Name: count, dtype: int64\n",
"❕ Test print: 502297 issues before merged with query DataFrame\n",
"❕ Test print: 561112 issues after merged with query DataFrame\n",
"❕ Test print: 502297 issues after processed done\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_523178/834440382.py:16: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"link_created_time\"] = None\n",
"/tmp/ipykernel_523178/834440382.py:100: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"link_created_time\"] = pd.to_datetime(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ ----------------------------\n",
"\n"
]
}
],
"source": [
"for eco_name in ECO_NAMES:\n",
" # 加载Issue和链接数据DataFrame\n",
" issue_df = load_issues(eco_name)\n",
" link_df = load_links(eco_name)\n",
" print(\n",
" f\"✔ Loaded {len(issue_df)} raw issues and {len(link_df)} raw links for {eco_name}\"\n",
" )\n",
"\n",
" # 对Issue和链接数据进行清理\n",
" issue_df = clean_issues(issue_df)\n",
" link_df = clean_links(link_df, issue_df)\n",
" print(f\"✔ Cleaned {len(issue_df)} issues for {eco_name}\")\n",
" print(f\"✔ Cleaned {len(link_df)} links for {eco_name}\")\n",
"\n",
" # 打印不同链接类型分布\n",
" print(\"✔ Link type distribution:\")\n",
" print(link_df[\"link_type\"].value_counts())\n",
"\n",
" # 添加Issue关闭时间\n",
" issue_df = query_issue_closed_time(eco_name, issue_df)\n",
"\n",
" # 添加链接创建时间\n",
" link_df = query_link_created_time(eco_name, link_df)\n",
"\n",
" # 联合Issue和链接数据\n",
" link_df = joined_links(link_df, issue_df)\n",
"\n",
" # 保存清理后的Issue和链接数据\n",
" issue_df.to_csv(\n",
" PRO_ISSUE_DIR / (eco_name + \".csv\"),\n",
" sep=\";\",\n",
" index=True, #! issue_df的key被设置为了索引列所以这里需要保存\n",
" )\n",
" link_df.to_csv(\n",
" PRO_LINK_DIR / (eco_name + \".csv\"),\n",
" sep=\";\",\n",
" index=False,\n",
" )\n",
"\n",
" print(\"✅ ----------------------------\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long