Complete the experiments for the empirical-analysis section

This commit is contained in:
huaian_zhou 2024-03-23 16:41:55 +08:00
parent cbf41e55be
commit 6d8ab9c802
44 changed files with 28861 additions and 0 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,341 @@
{
"Apache": {
"Blocked": {
"id": "12310361",
"name": "Blocked",
"inward": "Blocked",
"outward": "Blocked",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310361"
},
"Blocker": {
"id": "10032",
"name": "Blocker",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10032"
},
"Child-Issue": {
"id": "12310460",
"name": "Child-Issue",
"inward": "is a child of",
"outward": "is a parent of",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310460"
},
"Cloners": {
"id": "10020",
"name": "Cloners",
"inward": "is cloned by",
"outward": "is a clone of",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10020"
},
"Completes": {
"id": "12310660",
"name": "Completes",
"inward": "is fixed by",
"outward": "fixes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310660"
},
"Container": {
"id": "12310060",
"name": "Container",
"inward": "Is contained by",
"outward": "contains",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310060"
},
"Dependency": {
"id": "12310461",
"name": "Dependency",
"inward": "Dependency",
"outward": "Dependency",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310461"
},
"Dependent": {
"id": "12310360",
"name": "Dependent",
"inward": "Dependent",
"outward": "Dependent",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310360"
},
"Duplicate": {
"id": "12310000",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310000"
},
"Incorporates": {
"id": "12310010",
"name": "Incorporates",
"inward": "is part of",
"outward": "incorporates",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310010"
},
"Issue split": {
"id": "12310761",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310761"
},
"Parent Feature": {
"id": "12310462",
"name": "Parent Feature",
"inward": "Parent Feature",
"outward": "Parent Feature",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310462"
},
"Problem/Incident": {
"id": "12310560",
"name": "Problem/Incident",
"inward": "is caused by",
"outward": "causes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310560"
},
"Reference": {
"id": "10030",
"name": "Reference",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10030"
},
"Regression": {
"id": "12310050",
"name": "Regression",
"inward": "is broken by",
"outward": "breaks",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310050"
},
"Related": {
"id": "12310260",
"name": "Related",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310260"
},
"Required": {
"id": "12310040",
"name": "Required",
"inward": "is required by",
"outward": "requires",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310040"
},
"Supercedes": {
"id": "12310051",
"name": "Supercedes",
"inward": "is superceded by",
"outward": "supercedes",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310051"
},
"Testing": {
"id": "12310760",
"name": "Testing",
"inward": "Discovered while testing",
"outward": "Testing discovered",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/12310760"
},
"dependent": {
"id": "10001",
"name": "dependent",
"inward": "is depended upon by",
"outward": "depends upon",
"self": "https://issues.apache.org/jira/rest/api/2/issueLinkType/10001"
}
},
"Jira": {},
"Mojang": {
"Blocks": {
"id": "10100",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10100"
},
"Bonfire Testing": {
"id": "10000",
"name": "Bonfire Testing",
"inward": "discovered while testing",
"outward": "testing discovered",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10000"
},
"Cloners": {
"id": "10101",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10101"
},
"Duplicate": {
"id": "10102",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10102"
},
"Problem/Incident": {
"id": "10500",
"name": "Problem/Incident",
"inward": "is caused by",
"outward": "causes",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10500"
},
"Relates": {
"id": "10103",
"name": "Relates",
"inward": "relates to",
"outward": "relates to",
"self": "https://bugs.mojang.com/rest/api/2/issueLinkType/10103"
}
},
"MongoDB": {},
"Qt": {
"Blocks": {
"id": "10282",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10282"
},
"Cloners": {
"id": "10281",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10281"
},
"Covered": {
"id": "10381",
"name": "Covered",
"inward": "is covered by",
"outward": "covers",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10381"
},
"Dependency": {
"id": "10001",
"name": "Dependency",
"inward": "is required for",
"outward": "depends on",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10001"
},
"Duplicate": {
"id": "10180",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10180"
},
"Issue split": {
"id": "10280",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10280"
},
"Relates": {
"id": "10070",
"name": "Relates",
"inward": "relates to",
"outward": "relates to",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10070"
},
"Replacement": {
"id": "10031",
"name": "Replacement",
"inward": "replaces",
"outward": "is replaced by",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10031"
},
"Test": {
"id": "10020",
"name": "Test",
"inward": "Is tested by",
"outward": "tests",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10020"
},
"Work Breakdown": {
"id": "10040",
"name": "Work Breakdown",
"inward": "resulted from",
"outward": "resulted in",
"self": "https://bugreports.qt.io/rest/api/2/issueLinkType/10040"
}
},
"RedHat": {
"Account": {
"id": "12310920",
"name": "Account",
"inward": "account is impacted by",
"outward": "impacts account",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310920"
},
"Blocks": {
"id": "12310720",
"name": "Blocks",
"inward": "is blocked by",
"outward": "blocks",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310720"
},
"Causality": {
"id": "12310220",
"name": "Causality",
"inward": "is caused by",
"outward": "causes",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310220"
},
"Cloners": {
"id": "12310120",
"name": "Cloners",
"inward": "is cloned by",
"outward": "clones",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310120"
},
"Depend": {
"id": "12311220",
"name": "Depend",
"inward": "is depended on by",
"outward": "depends on",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311220"
},
"Document": {
"id": "12310420",
"name": "Document",
"inward": "is documented by",
"outward": "documents",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310420"
},
"Duplicate": {
"id": "12310000",
"name": "Duplicate",
"inward": "is duplicated by",
"outward": "duplicates",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310000"
},
"Incorporates": {
"id": "10011",
"name": "Incorporates",
"inward": "is incorporated by",
"outward": "incorporates",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/10011"
},
"Issue split": {
"id": "12311720",
"name": "Issue split",
"inward": "split from",
"outward": "split to",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12311720"
},
"Related": {
"id": "12310001",
"name": "Related",
"inward": "is related to",
"outward": "relates to",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310001"
},
"Triggers": {
"id": "12310723",
"name": "Triggers",
"inward": "is triggered by",
"outward": "is triggering",
"self": "https://issues.redhat.com/rest/api/2/issueLinkType/12310723"
}
}
}
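The JSON above maps each ecosystem to its documented issue link types; every entry records the type id, name, and the inward/outward phrasing of the relation. A minimal sketch of reading this mapping (it assumes the file is saved as ../data/eco_link_types.json, the path the notebooks below load it from):

import json

# Load the per-ecosystem link type mapping.
with open("../data/eco_link_types.json") as f:
    eco_link_types = json.load(f)

# List the documented link type names per ecosystem.
for eco_name, link_types in eco_link_types.items():
    print(eco_name, sorted(link_types))

# Look up the directional phrasing of one link type, e.g. RedHat's "Blocks".
blocks = eco_link_types["RedHat"]["Blocks"]
print(blocks["outward"], "/", blocks["inward"])  # blocks / is blocked by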

View File

@ -0,0 +1,7 @@
;Year;Issues;DIT;UIT;Links;DLT;ULT;UP;Changes;Ch/I;Comments;Co/I
RedHat;2001.0;502297.0;79.0;64.0;268935.0;11.0;11.0;807.0;7197717.0;14.0;1115471.0;2.0
Qt;2005.0;180574.0;15.0;15.0;58621.0;10.0;10.0;60.0;2307707.0;13.0;507214.0;3.0
Sum;;682871.0;94.0;79.0;327556.0;21.0;21.0;867.0;9505424.0;27.0;1622685.0;5.0
Mean;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
Median;;341435.5;47.0;39.5;163778.0;10.5;10.5;433.5;4752712.0;13.5;811342.5;2.5
Std Dev;;160861.5;32.0;24.5;105157.0;0.5;0.5;373.5;2445005.0;0.5;304128.5;0.5
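This overview is written by the notebook below as a semicolon-separated CSV. A small sketch of reading it back (assuming the path ../data/ecos_overview.csv used in that notebook):

import pandas as pd

# The file is ";"-delimited; the first, unnamed column holds the ecosystem names.
ecos_overview = pd.read_csv("../data/ecos_overview.csv", sep=";", index_col=0)
print(ecos_overview.loc["RedHat", "Issues"])  # 502297.0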

File diff suppressed because it is too large

View File

@ -0,0 +1,734 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e35d2209-3e5b-4cd7-a702-2eed1badf800",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.183120Z",
"iopub.status.busy": "2022-01-25T09:42:06.182949Z",
"iopub.status.idle": "2022-01-25T09:42:06.839486Z",
"shell.execute_reply": "2022-01-25T09:42:06.838906Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.183099Z"
},
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from time import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pymongo import MongoClient\n",
"from statistics import mean, median\n",
"\n",
"# 确保DataFrame的列长不会被截断\n",
"pd.set_option(\"display.max_colwidth\", None)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cb035275-5360-43cc-8dec-e7d1df4c7417",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.842313Z",
"iopub.status.busy": "2022-01-25T09:42:06.841976Z",
"iopub.status.idle": "2022-01-25T09:42:06.867490Z",
"shell.execute_reply": "2022-01-25T09:42:06.866866Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.842270Z"
},
"tags": []
},
"outputs": [],
"source": [
"# 加载Jira软件生态元数据\n",
"with open(\"../data/jira_ecos_info.json\") as f:\n",
" jira_ecos_info = json.load(f)\n",
"\n",
"# 加载生态中使用的Issue类型信息使用`data_crawl.ipynb`下载)\n",
"with open(\"../data/eco_issue_types.json\") as f:\n",
" eco_issue_types = json.load(f)\n",
"\n",
"# 加载生态中使用的链接类型信息(使用`data_crawl.ipynb`下载)\n",
"with open(\"../data/eco_link_types.json\") as f:\n",
" eco_link_types = json.load(f)\n",
"\n",
"# 连接到数据库\n",
"client = MongoClient()\n",
"db = client[\"JiraEcos\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "07c7b0c3",
"metadata": {},
"outputs": [],
"source": [
"# ALL_ECOS = [name for name in jira_eco_sources.keys()]\n",
"ALL_ECOS = [\"RedHat\", \"Qt\"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6897e14d",
"metadata": {},
"outputs": [],
"source": [
"# 格式化时间间隔\n",
"def format_duration(start_time, end_time):\n",
" # 计算总秒数\n",
" seconds = end_time - start_time\n",
" # 计算分钟和小时数\n",
" minutes = int(seconds / 60)\n",
" hours = int(minutes / 60)\n",
" display_minutes = int(minutes % 60)\n",
" display_seconds = int(seconds % 60)\n",
"\n",
" return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d1f407e3-d97b-4125-9723-35b613b42534",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:06.923021Z",
"iopub.status.busy": "2022-01-25T09:42:06.922589Z",
"iopub.status.idle": "2022-01-25T09:42:06.928681Z",
"shell.execute_reply": "2022-01-25T09:42:06.927331Z",
"shell.execute_reply.started": "2022-01-25T09:42:06.922989Z"
},
"tags": []
},
"outputs": [],
"source": [
"ecos_df = pd.DataFrame(\n",
" np.nan,\n",
" columns=[\n",
" \"Year\", # 生态创建时间\n",
" \"Issues\", # Issue总数\n",
" \"DIT\", # Documented Issue Types登记的Issue类型数\n",
" \"UIT\", # Used Issue Types使用的Issue类型数\n",
" \"Links\", # 链接总数\n",
" \"DLT\", # Documented Link Types登记的链接类型数\n",
" \"ULT\", # Used Link Types使用的链接类型数\n",
" \"UP\", # Unique Projects项目总数\n",
" \"Changes\", # 更改总数\n",
" \"Ch/I\", # Changes/Issues\n",
" \"Comments\", # 评论总数\n",
" \"Co/I\", # Comments/Issues\n",
" ],\n",
" index=ALL_ECOS + [\"Sum\", \"Mean\", \"Median\", \"Std Dev\"], # 总和、均值、中值、标准差\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "af17b5f7-adea-462a-a6bd-e0bf36290781",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T09:42:07.611366Z",
"iopub.status.busy": "2022-01-25T09:42:07.611062Z",
"iopub.status.idle": "2022-01-25T11:06:32.931020Z",
"shell.execute_reply": "2022-01-25T11:06:32.923594Z",
"shell.execute_reply.started": "2022-01-25T09:42:07.611342Z"
},
"tags": []
},
"outputs": [],
"source": [
"def populate_ecos_df(ecos_df, eco_names=ALL_ECOS):\n",
" # 填充DataFrame\n",
"\n",
" def extract_number_of_issues(eco_name):\n",
" # 查询Issue总数\n",
" issues_collection = db[eco_name]\n",
" num_issues = issues_collection.count_documents({})\n",
"\n",
" return num_issues\n",
"\n",
" def extract_number_of_doc_issuetypes(eco_name):\n",
" # 查询记录的Issue类型数\n",
" return len(eco_issue_types[eco_name])\n",
"\n",
" def extract_number_of_used_issuetypes(eco_name):\n",
" # 查询在最后状态下的Issue类型\n",
" issues_collection = db[eco_name]\n",
" final_types_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 取出'$fields.issuetype.name'字段,并重命名\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuetype_name\": \"$fields.issuetype.name\",\n",
" }\n",
" },\n",
" # 分组把所有Issue类型名放入一个集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_final_issuetypes = (\n",
" set(final_types_query[0][\"issuetype_names\"])\n",
" if final_types_query != []\n",
" else set()\n",
" )\n",
"\n",
" # 查询在历史中使用过的Issue类型\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" history_types_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" # 展开'$history.items'数组\n",
" {\"$unwind\": \"$history.items\"},\n",
" # 筛选更改项item的域为'issuetype'的文档\n",
" {\"$match\": {\"history.items.field\": \"issuetype\"}},\n",
" # 取出item的'fromString'即更改前的Issue类型\n",
" # !注意更改后的Issue类型'toString'会在下一次更改中作为更改前的值\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuetype_name\": \"$history.items.fromString\",\n",
" }\n",
" },\n",
" # 分组把所有Issue类型名放入一个集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuetype_names\": {\"$addToSet\": \"$issuetype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_history_issuetypes = (\n",
" set(history_types_query[0][\"issuetype_names\"])\n",
" if history_types_query != []\n",
" else set()\n",
" )\n",
" # 合并两个集合\n",
" return len(set.union(unique_final_issuetypes, unique_history_issuetypes))\n",
"\n",
" def extract_number_of_links(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 查询链接总数\n",
" links_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 筛选'$fields.issuelinks'字段非空的文档\n",
" {\"$match\": {\"fields.issuelinks\": {\"$exists\": True, \"$ne\": []}}},\n",
" # 取出issuelink的id字段数组\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"issuelink_ids_issue\": \"$fields.issuelinks.id\",\n",
" }\n",
" },\n",
" # 把id字段数组展开\n",
" {\"$unwind\": \"$issuelink_ids_issue\"},\n",
" # 统计链接的id\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"issuelink_unique_ids\": {\n",
" \"$addToSet\": \"$issuelink_ids_issue\"\n",
" },\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" num_issuelinks = (\n",
" len(set(links_query[0][\"issuelink_unique_ids\"])) if links_query != [] else 0\n",
" )\n",
"\n",
" # 查询subtask链接总数\n",
" subtasks_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 筛选'$fields.subtasks'字段非空的文档\n",
" {\"$match\": {\"fields.subtasks\": {\"$exists\": True, \"$ne\": []}}},\n",
" # 计算Issue的subtask数量\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"num_issue_subtasks\": {\"$size\": \"$fields.subtasks\"},\n",
" }\n",
" },\n",
" # 计算整个集合内subtask数量\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"num_subtasks\": {\"$sum\": \"$num_issue_subtasks\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" num_subtasks = subtasks_query[0][\"num_subtasks\"] if subtasks_query != [] else 0\n",
"\n",
" # 查询epic链接总数\n",
" # epic链接字段是自定义的\n",
" EPICLINK_FIELD_DICT = {\n",
" \"Apache\": \"customfield_12311120\",\n",
" \"Jira\": \"customfield_12931\",\n",
" \"Mojang\": \"customfield_11602\",\n",
" \"MongoDB\": \"customfield_10857\",\n",
" \"Qt\": \"customfield_10400\",\n",
" \"RedHat\": \"customfield_12311140\",\n",
" }\n",
" epiclinks_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 把自定义epic链接字段统一重命名为'epiclink_field'\n",
" {\n",
" \"$project\": {\n",
" \"epiclink_field\": f\"$fields.{EPICLINK_FIELD_DICT[eco_name]}\"\n",
" }\n",
" },\n",
" # 筛选epiclink字段非空的文档\n",
" # !注意epic链接是由子Issue指向父Issue的\n",
" {\"$match\": {\"epiclink_field\": {\"$exists\": True, \"$ne\": None}}},\n",
" # 统计聚合的文档总数\n",
" {\"$count\": \"num_epiclinks\"},\n",
" ]\n",
" )\n",
" )\n",
" num_epiclinks = (\n",
" epiclinks_query[0][\"num_epiclinks\"] if epiclinks_query != [] else 0\n",
" )\n",
"\n",
" return sum([num_issuelinks, num_subtasks, num_epiclinks])\n",
"\n",
" def extract_number_of_doc_linktypes(eco_name):\n",
" # 查询记录的链接类型数\n",
" return len(eco_link_types[eco_name])\n",
"\n",
" def extract_number_of_used_linktypes(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 查询在最后状态下的链接类型\n",
" final_linktypes_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" # 展开issuelinks数组\n",
" {\"$unwind\": \"$fields.issuelinks\"},\n",
" # 选择链接类型名字段\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"linktype_name\": \"$fields.issuelinks.type.name\",\n",
" }\n",
" },\n",
" # 分组,把所有链接类型名加入集合\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"linktype_names\": {\"$addToSet\": \"$linktype_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
"\n",
" return (\n",
" len(set(final_linktypes_query[0][\"linktype_names\"]))\n",
" if final_linktypes_query != []\n",
" else 0\n",
" )\n",
"\n",
" def extract_born(eco_name):\n",
" issues_collection = db[eco_name]\n",
" # 取出最初的N个Issue创建时间检查生态的最早创建时间\n",
" created_dates = [\n",
" issue[\"fields\"][\"created\"]\n",
" for issue in issues_collection.aggregate(\n",
" [\n",
" # 取出Issue创建时间\n",
" {\"$project\": {\"_id\": 0, \"fields.created\": 1}},\n",
" # 按创建时间升序排列\n",
" {\"$sort\": {\"fields.created\": 1}},\n",
" # 实际中有些Issue会损坏或者是测试Issue所以需要手动检查创建时间\n",
" {\"$limit\": 500},\n",
" ]\n",
" )\n",
" ]\n",
" # 手动检查创建时间把损坏的或测试Issue的创建时间略过\n",
" if eco_name == \"Apache\":\n",
" created_dates = created_dates[289:]\n",
" elif eco_name == \"Jira\":\n",
" created_dates = created_dates[1:]\n",
" elif eco_name == \"Qt\":\n",
" created_dates = created_dates[7:]\n",
"\n",
" return float(created_dates[0][:4])\n",
"\n",
" def extract_number_of_changes(eco_name):\n",
" # 查询更改总数\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" changes_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" # 取出更改对应的域(数组)\n",
" {\"$project\": {\"_id\": 0, \"history.items.field\": 1}},\n",
" # 把更改数组展开\n",
" {\"$unwind\": \"$history.items\"},\n",
" # 统计更改总数\n",
" {\"$count\": \"num_changes\"},\n",
" ]\n",
" )\n",
" )\n",
"\n",
" return changes_query[0][\"num_changes\"] if changes_query != [] else 0\n",
"\n",
" def extract_number_of_unique_projects(eco_name):\n",
" # 查询在最后状态下的项目名\n",
" issues_collection = db[eco_name]\n",
" final_projects_query = list(\n",
" issues_collection.aggregate(\n",
" [\n",
" {\"$project\": {\"_id\": 0, \"project_name\": \"$fields.project.name\"}},\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_final_projects = (\n",
" set(final_projects_query[0][\"project_names\"])\n",
" if final_projects_query != []\n",
" else set()\n",
" )\n",
"\n",
" # 查询在历史中的项目名\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
" history_projects_query = list(\n",
" histories_collection.aggregate(\n",
" [\n",
" {\"$unwind\": \"$history.items\"},\n",
" {\n",
" \"$match\": {\n",
" \"history.items.field\": {\"$in\": [\"project\", \"Project\"]}\n",
" }\n",
" },\n",
" {\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"project_name\": \"$history.items.fromString\",\n",
" }\n",
" },\n",
" {\n",
" \"$group\": {\n",
" \"_id\": None,\n",
" \"project_names\": {\"$addToSet\": \"$project_name\"},\n",
" }\n",
" },\n",
" ]\n",
" )\n",
" )\n",
" unique_history_projects = (\n",
" set(history_projects_query[0][\"project_names\"])\n",
" if history_projects_query != []\n",
" else set()\n",
" )\n",
"\n",
" return len(set.union(unique_final_projects, unique_history_projects))\n",
"\n",
" def extract_number_of_comments(eco_name):\n",
" # 查询评论总数\n",
" comments_collection = db[eco_name + \"Comments\"]\n",
" num_comments = comments_collection.count_documents({})\n",
" return num_comments\n",
"\n",
" start_time = time() # 记录总处理时间\n",
" for eco_name in eco_names:\n",
" eco_start_time = time() # 记录处理一个ecosystem的时间\n",
" print(f\"Working on ecosystem: {eco_name} ...\")\n",
"\n",
" # Issue总数及类型\n",
" ecos_df.loc[eco_name, \"Issues\"] = extract_number_of_issues(eco_name)\n",
" ecos_df.loc[eco_name, \"DIT\"] = extract_number_of_doc_issuetypes(eco_name)\n",
" ecos_df.loc[eco_name, \"UIT\"] = extract_number_of_used_issuetypes(eco_name)\n",
"\n",
" # Link总数及类型\n",
" ecos_df.loc[eco_name, \"Links\"] = extract_number_of_links(eco_name)\n",
" ecos_df.loc[eco_name, \"DLT\"] = extract_number_of_doc_linktypes(eco_name)\n",
" ecos_df.loc[eco_name, \"ULT\"] = extract_number_of_used_linktypes(eco_name)\n",
"\n",
" # 其他信息\n",
" ecos_df.loc[eco_name, \"Year\"] = extract_born(eco_name)\n",
" ecos_df.loc[eco_name, \"Changes\"] = extract_number_of_changes(eco_name)\n",
" ecos_df.loc[eco_name, \"Ch/I\"] = round(\n",
" ecos_df.loc[eco_name, \"Changes\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
" )\n",
" ecos_df.loc[eco_name, \"UP\"] = extract_number_of_unique_projects(eco_name)\n",
" ecos_df.loc[eco_name, \"Comments\"] = extract_number_of_comments(eco_name)\n",
" ecos_df.loc[eco_name, \"Co/I\"] = round(\n",
" ecos_df.loc[eco_name, \"Comments\"] / ecos_df.loc[eco_name, \"Issues\"]\n",
" )\n",
" print(\n",
" f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
" )\n",
" print(\"\")\n",
"\n",
" print(\n",
" f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
" )\n",
" return ecos_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1698189d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Working on ecosystem: RedHat ...\n",
"✔ RedHat completely processed. Duration: 00:00:45\n",
"\n",
"Working on ecosystem: Qt ...\n",
"✔ Qt completely processed. Duration: 00:00:11\n",
"\n",
"✅ All completely processed. Total duration: 00:00:56\n"
]
}
],
"source": [
"ecos_df = populate_ecos_df(\n",
" ecos_df,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fc015a43-fd77-46bd-8712-b65476b36d3c",
"metadata": {
"execution": {
"iopub.execute_input": "2022-01-25T11:06:33.006871Z",
"iopub.status.busy": "2022-01-25T11:06:33.006437Z",
"iopub.status.idle": "2022-01-25T11:06:33.162697Z",
"shell.execute_reply": "2022-01-25T11:06:33.161886Z",
"shell.execute_reply.started": "2022-01-25T11:06:33.006832Z"
},
"tags": []
},
"outputs": [],
"source": [
"def display_ecos_df(ecos_df):\n",
"\n",
" # 计算各列总和、均值、中值以及标准差\n",
" for header in ecos_df.columns:\n",
" if header in [\"Year\"]:\n",
" continue\n",
" ecos_df.loc[\"Sum\", header] = sum(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Mean\", header] = mean(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Median\", header] = median(ecos_df[header][: len(ALL_ECOS)])\n",
" ecos_df.loc[\"Std Dev\", header] = np.std(ecos_df[header][: len(ALL_ECOS)])\n",
"\n",
" # 格式化某些列的值\n",
" comma_separated_columns = {\n",
" col_name: \"{:,.0f}\" for col_name in [\"Issues\", \"Links\", \"Changes\", \"Comments\"]\n",
" }\n",
"\n",
" # 展示DataFrame\n",
" display(\n",
" ecos_df.style.set_table_styles(\n",
" [dict(selector=\"th\", props=[(\"text-align\", \"left\")])]\n",
" ).format(comma_separated_columns, precision=0)\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7dfad2f7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_9db15 th {\n",
" text-align: left;\n",
"}\n",
"</style>\n",
"<table id=\"T_9db15\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_9db15_level0_col0\" class=\"col_heading level0 col0\" >Year</th>\n",
" <th id=\"T_9db15_level0_col1\" class=\"col_heading level0 col1\" >Issues</th>\n",
" <th id=\"T_9db15_level0_col2\" class=\"col_heading level0 col2\" >DIT</th>\n",
" <th id=\"T_9db15_level0_col3\" class=\"col_heading level0 col3\" >UIT</th>\n",
" <th id=\"T_9db15_level0_col4\" class=\"col_heading level0 col4\" >Links</th>\n",
" <th id=\"T_9db15_level0_col5\" class=\"col_heading level0 col5\" >DLT</th>\n",
" <th id=\"T_9db15_level0_col6\" class=\"col_heading level0 col6\" >ULT</th>\n",
" <th id=\"T_9db15_level0_col7\" class=\"col_heading level0 col7\" >UP</th>\n",
" <th id=\"T_9db15_level0_col8\" class=\"col_heading level0 col8\" >Changes</th>\n",
" <th id=\"T_9db15_level0_col9\" class=\"col_heading level0 col9\" >Ch/I</th>\n",
" <th id=\"T_9db15_level0_col10\" class=\"col_heading level0 col10\" >Comments</th>\n",
" <th id=\"T_9db15_level0_col11\" class=\"col_heading level0 col11\" >Co/I</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row0\" class=\"row_heading level0 row0\" >RedHat</th>\n",
" <td id=\"T_9db15_row0_col0\" class=\"data row0 col0\" >2001</td>\n",
" <td id=\"T_9db15_row0_col1\" class=\"data row0 col1\" >502,297</td>\n",
" <td id=\"T_9db15_row0_col2\" class=\"data row0 col2\" >79</td>\n",
" <td id=\"T_9db15_row0_col3\" class=\"data row0 col3\" >64</td>\n",
" <td id=\"T_9db15_row0_col4\" class=\"data row0 col4\" >268,935</td>\n",
" <td id=\"T_9db15_row0_col5\" class=\"data row0 col5\" >11</td>\n",
" <td id=\"T_9db15_row0_col6\" class=\"data row0 col6\" >11</td>\n",
" <td id=\"T_9db15_row0_col7\" class=\"data row0 col7\" >807</td>\n",
" <td id=\"T_9db15_row0_col8\" class=\"data row0 col8\" >7,197,717</td>\n",
" <td id=\"T_9db15_row0_col9\" class=\"data row0 col9\" >14</td>\n",
" <td id=\"T_9db15_row0_col10\" class=\"data row0 col10\" >1,115,471</td>\n",
" <td id=\"T_9db15_row0_col11\" class=\"data row0 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row1\" class=\"row_heading level0 row1\" >Qt</th>\n",
" <td id=\"T_9db15_row1_col0\" class=\"data row1 col0\" >2005</td>\n",
" <td id=\"T_9db15_row1_col1\" class=\"data row1 col1\" >180,574</td>\n",
" <td id=\"T_9db15_row1_col2\" class=\"data row1 col2\" >15</td>\n",
" <td id=\"T_9db15_row1_col3\" class=\"data row1 col3\" >15</td>\n",
" <td id=\"T_9db15_row1_col4\" class=\"data row1 col4\" >58,621</td>\n",
" <td id=\"T_9db15_row1_col5\" class=\"data row1 col5\" >10</td>\n",
" <td id=\"T_9db15_row1_col6\" class=\"data row1 col6\" >10</td>\n",
" <td id=\"T_9db15_row1_col7\" class=\"data row1 col7\" >60</td>\n",
" <td id=\"T_9db15_row1_col8\" class=\"data row1 col8\" >2,307,707</td>\n",
" <td id=\"T_9db15_row1_col9\" class=\"data row1 col9\" >13</td>\n",
" <td id=\"T_9db15_row1_col10\" class=\"data row1 col10\" >507,214</td>\n",
" <td id=\"T_9db15_row1_col11\" class=\"data row1 col11\" >3</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row2\" class=\"row_heading level0 row2\" >Sum</th>\n",
" <td id=\"T_9db15_row2_col0\" class=\"data row2 col0\" >nan</td>\n",
" <td id=\"T_9db15_row2_col1\" class=\"data row2 col1\" >682,871</td>\n",
" <td id=\"T_9db15_row2_col2\" class=\"data row2 col2\" >94</td>\n",
" <td id=\"T_9db15_row2_col3\" class=\"data row2 col3\" >79</td>\n",
" <td id=\"T_9db15_row2_col4\" class=\"data row2 col4\" >327,556</td>\n",
" <td id=\"T_9db15_row2_col5\" class=\"data row2 col5\" >21</td>\n",
" <td id=\"T_9db15_row2_col6\" class=\"data row2 col6\" >21</td>\n",
" <td id=\"T_9db15_row2_col7\" class=\"data row2 col7\" >867</td>\n",
" <td id=\"T_9db15_row2_col8\" class=\"data row2 col8\" >9,505,424</td>\n",
" <td id=\"T_9db15_row2_col9\" class=\"data row2 col9\" >27</td>\n",
" <td id=\"T_9db15_row2_col10\" class=\"data row2 col10\" >1,622,685</td>\n",
" <td id=\"T_9db15_row2_col11\" class=\"data row2 col11\" >5</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row3\" class=\"row_heading level0 row3\" >Mean</th>\n",
" <td id=\"T_9db15_row3_col0\" class=\"data row3 col0\" >nan</td>\n",
" <td id=\"T_9db15_row3_col1\" class=\"data row3 col1\" >341,436</td>\n",
" <td id=\"T_9db15_row3_col2\" class=\"data row3 col2\" >47</td>\n",
" <td id=\"T_9db15_row3_col3\" class=\"data row3 col3\" >40</td>\n",
" <td id=\"T_9db15_row3_col4\" class=\"data row3 col4\" >163,778</td>\n",
" <td id=\"T_9db15_row3_col5\" class=\"data row3 col5\" >10</td>\n",
" <td id=\"T_9db15_row3_col6\" class=\"data row3 col6\" >10</td>\n",
" <td id=\"T_9db15_row3_col7\" class=\"data row3 col7\" >434</td>\n",
" <td id=\"T_9db15_row3_col8\" class=\"data row3 col8\" >4,752,712</td>\n",
" <td id=\"T_9db15_row3_col9\" class=\"data row3 col9\" >14</td>\n",
" <td id=\"T_9db15_row3_col10\" class=\"data row3 col10\" >811,342</td>\n",
" <td id=\"T_9db15_row3_col11\" class=\"data row3 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row4\" class=\"row_heading level0 row4\" >Median</th>\n",
" <td id=\"T_9db15_row4_col0\" class=\"data row4 col0\" >nan</td>\n",
" <td id=\"T_9db15_row4_col1\" class=\"data row4 col1\" >341,436</td>\n",
" <td id=\"T_9db15_row4_col2\" class=\"data row4 col2\" >47</td>\n",
" <td id=\"T_9db15_row4_col3\" class=\"data row4 col3\" >40</td>\n",
" <td id=\"T_9db15_row4_col4\" class=\"data row4 col4\" >163,778</td>\n",
" <td id=\"T_9db15_row4_col5\" class=\"data row4 col5\" >10</td>\n",
" <td id=\"T_9db15_row4_col6\" class=\"data row4 col6\" >10</td>\n",
" <td id=\"T_9db15_row4_col7\" class=\"data row4 col7\" >434</td>\n",
" <td id=\"T_9db15_row4_col8\" class=\"data row4 col8\" >4,752,712</td>\n",
" <td id=\"T_9db15_row4_col9\" class=\"data row4 col9\" >14</td>\n",
" <td id=\"T_9db15_row4_col10\" class=\"data row4 col10\" >811,342</td>\n",
" <td id=\"T_9db15_row4_col11\" class=\"data row4 col11\" >2</td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_9db15_level0_row5\" class=\"row_heading level0 row5\" >Std Dev</th>\n",
" <td id=\"T_9db15_row5_col0\" class=\"data row5 col0\" >nan</td>\n",
" <td id=\"T_9db15_row5_col1\" class=\"data row5 col1\" >160,862</td>\n",
" <td id=\"T_9db15_row5_col2\" class=\"data row5 col2\" >32</td>\n",
" <td id=\"T_9db15_row5_col3\" class=\"data row5 col3\" >24</td>\n",
" <td id=\"T_9db15_row5_col4\" class=\"data row5 col4\" >105,157</td>\n",
" <td id=\"T_9db15_row5_col5\" class=\"data row5 col5\" >0</td>\n",
" <td id=\"T_9db15_row5_col6\" class=\"data row5 col6\" >0</td>\n",
" <td id=\"T_9db15_row5_col7\" class=\"data row5 col7\" >374</td>\n",
" <td id=\"T_9db15_row5_col8\" class=\"data row5 col8\" >2,445,005</td>\n",
" <td id=\"T_9db15_row5_col9\" class=\"data row5 col9\" >0</td>\n",
" <td id=\"T_9db15_row5_col10\" class=\"data row5 col10\" >304,128</td>\n",
" <td id=\"T_9db15_row5_col11\" class=\"data row5 col11\" >0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x7f2c251df1f0>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_ecos_df(ecos_df)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "63163049",
"metadata": {},
"outputs": [],
"source": [
"ecos_df.to_csv(\"../data/ecos_overview.csv\", sep=\";\", encoding=\"utf-8\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
},
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,421,0,1.0,2.1489339812279398,3.5470614635577187
RedHat,with,117,0,1.0,2.2934478185438794,3.9874327658983035
RedHat,in,117,0,1.0,2.268535068928034,4.001612675816817
RedHat,cross,117,0,2.0,3.1102165516976346,4.951732355023365

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,421,0,1.0,1.8907019826905265,3.290788730118782
RedHat,with,117,0,2.0,2.993302339192955,4.6213686922437915
RedHat,in,117,0,2.0,3.0957131121431085,4.758508197318559
RedHat,cross,117,0,2.0,3.1515352998065764,4.9921727596311385

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
ecosystem,#issues,#issues_with_links,#links,%issues_with_links,#max_links,#min_links,#median_links,#mean_links,#link_types,#projects,#links_cross_project,%links_cross_project
RedHat,502297,249581,238053,49.69,399,1,1.0,1.9076211730860924,13,279,34866,14.65
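A hedged sketch of how headline figures such as %issues_with_links and %links_cross_project could be recomputed; it assumes the cleaned per-ecosystem CSVs keep the raw column layout (issues keyed by issue key; links with link_type, in_issue_key, out_issue_key) and the ";" delimiter used elsewhere in this commit:

import pandas as pd

# Assumed paths and columns; adjust to the actual processed files.
issues = pd.read_csv("../data/processed/issues/RedHat.csv", sep=";", index_col="key")
links = pd.read_csv("../data/processed/links/RedHat.csv", sep=";")

# Share of issues that participate in at least one link.
linked_issues = set(links["in_issue_key"]) | set(links["out_issue_key"])
per_issues_with_links = 100 * len(linked_issues & set(issues.index)) / len(issues)

# A Jira key is "<PROJECT>-<number>", so a cross-project link joins keys with different prefixes.
in_proj = links["in_issue_key"].str.split("-").str[0]
out_proj = links["out_issue_key"].str.split("-").str[0]
per_links_cross_project = 100 * (in_proj != out_proj).mean()

print(round(per_issues_with_links, 2), round(per_links_cross_project, 2))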

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,5 @@
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,without,252716,80693,31.93,172023,68.07
RedHat,with,249581,81104,32.5,168477,67.5
RedHat,in,219867,69418,31.57,150449,68.43
RedHat,cross,52043,20816,40.0,31227,60.0

View File

@ -0,0 +1,5 @@
ecosystem,scope,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,without,351946,112787,32.05,239159,67.95
RedHat,with,150351,49010,32.6,101341,67.4
RedHat,in,119294,36870,30.91,82424,69.09
RedHat,cross,49632,20120,40.54,29512,59.46

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,6412,0,103,460,856
RedHat,with,6283,0,113,366,660
RedHat,in,6283,0,104,340,639
RedHat,cross,5815,0,219,531,736

View File

@ -0,0 +1,5 @@
ecosystem,scope,max,min,median,mean,std
RedHat,without,6412,0,95,405,785
RedHat,with,6283,0,140,432,720
RedHat,in,6283,0,124,399,701
RedHat,cross,5815,0,230,548,746

Binary file not shown.

View File

@ -0,0 +1,7 @@
ecosystem,scope,interval_type,max,min,median,mean,std
RedHat,with,cti,5372,0,24,103,218
RedHat,with,lti,5829,0,0,207,729
RedHat,in,cti,5372,0,22,99,210
RedHat,in,lti,5829,0,0,180,691
RedHat,cross,cti,4815,0,33,132,268
RedHat,cross,lti,5766,0,0,408,950

View File

@ -0,0 +1,7 @@
ecosystem,scope,interval_type,max,min,median,mean,std
RedHat,with,cti,5372,0,28,120,256
RedHat,with,lti,5829,0,0,399,979
RedHat,in,cti,5372,0,28,118,251
RedHat,in,lti,5829,0,0,388,978
RedHat,cross,cti,4815,0,28,128,271
RedHat,cross,lti,5766,0,0,443,984

View File

@ -0,0 +1,14 @@
ecosystem,link_type,min,q1,median,q3,max,mean,std
RedHat,Account,0,0.0,2.0,5.0,87,3.9,6.5
RedHat,Blocks,0,1.0,2.0,4.0,117,3.5,5.5
RedHat,Causality,0,0.0,2.0,4.0,100,3.6,5.7
RedHat,Cloners,0,0.0,1.0,3.0,100,2.4,3.9
RedHat,Depend,0,0.0,1.0,3.0,91,3.0,5.0
RedHat,Document,0,0.0,2.0,5.0,63,3.6,5.3
RedHat,Duplicate,0,1.0,2.0,4.0,117,3.4,5.2
RedHat,Epic,0,0.0,1.0,2.0,80,1.7,3.3
RedHat,Incorporates,0,1.0,1.0,3.0,83,2.6,4.0
RedHat,Issue split,0,0.0,1.0,3.0,77,2.5,5.0
RedHat,Related,0,1.0,2.0,5.0,117,3.9,5.6
RedHat,Subtask,0,0.0,1.0,2.0,117,1.6,3.2
RedHat,Triggers,0,0.0,2.0,4.0,62,3.4,5.6

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,14 @@
ecosystem,link_type,number,percentage,num_in,per_in,num_cross,per_cross
RedHat,Epic,67799,28.48,65360,96.4,2439,3.6
RedHat,Subtask,45020,18.91,45013,99.98,7,0.02
RedHat,Related,44222,18.58,34481,77.97,9741,22.03
RedHat,Cloners,29629,12.45,19142,64.61,10487,35.39
RedHat,Blocks,21106,8.87,16367,77.55,4739,22.45
RedHat,Incorporates,12847,5.4,9154,71.25,3693,28.75
RedHat,Duplicate,7080,2.97,6414,90.59,666,9.41
RedHat,Causality,4122,1.73,2714,65.84,1408,34.16
RedHat,Depend,2849,1.2,2311,81.12,538,18.88
RedHat,Document,1652,0.69,811,49.09,841,50.91
RedHat,Issue split,694,0.29,665,95.82,29,4.18
RedHat,Account,568,0.24,462,81.34,106,18.66
RedHat,Triggers,465,0.2,293,63.01,172,36.99

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,14 @@
ecosystem,link_type,num_issues,num_not_closed,per_not_closed,num_closed,per_closed
RedHat,Account,886,246,27.77,640,72.23
RedHat,Blocks,29857,9117,30.54,20740,69.46
RedHat,Causality,6920,2563,37.04,4357,62.96
RedHat,Cloners,48828,18147,37.17,30681,62.83
RedHat,Depend,4247,1647,38.78,2600,61.22
RedHat,Document,2795,672,24.04,2123,75.96
RedHat,Duplicate,13104,2646,20.19,10458,79.81
RedHat,Epic,77019,26516,34.43,50503,65.57
RedHat,Incorporates,16505,8090,49.02,8415,50.98
RedHat,Issue split,1071,467,43.6,604,56.4
RedHat,Related,63125,19934,31.58,43191,68.42
RedHat,Subtask,55986,15859,28.33,40127,71.67
RedHat,Triggers,779,284,36.46,495,63.54

View File

@ -0,0 +1,14 @@
ecosystem,link_type,min,q1,median,q3,max,mean,std
RedHat,Account,0.0,16.9,62.9,169.7,1189.9,129.8,169.4
RedHat,Blocks,0.0,48.6,174.1,552.9,6283.1,468.0,722.8
RedHat,Causality,0.0,40.2,167.9,608.2,3832.2,440.4,587.5
RedHat,Cloners,0.0,29.1,126.6,381.8,4714.3,342.2,532.4
RedHat,Depend,0.0,23.1,68.1,155.0,2052.2,115.4,145.9
RedHat,Document,0.0,55.0,132.9,326.5,3792.5,253.9,337.0
RedHat,Duplicate,0.0,16.2,104.0,444.4,5946.3,434.5,804.4
RedHat,Epic,0.0,27.8,99.3,294.9,4253.5,233.3,329.4
RedHat,Incorporates,0.0,80.1,213.4,575.2,5900.5,486.7,684.8
RedHat,Issue split,0.0,21.0,52.9,156.3,1173.3,118.5,161.8
RedHat,Related,0.0,45.1,173.7,594.8,6111.5,511.0,814.9
RedHat,Subtask,0.0,20.0,77.0,295.0,5899.6,325.7,702.4
RedHat,Triggers,0.0,18.3,63.0,197.0,1487.8,152.2,212.5

Binary file not shown.

View File

@ -0,0 +1,27 @@
ecosystem,link_type,interval_type,min,q1,median,q3,max,mean,std
RedHat,Account,cti,0.0,7.5,28.3,85.4,2004.1,87.4,191.1
RedHat,Account,lti,0.0,0.0,0.0,3.4,371.7,12.3,41.8
RedHat,Blocks,cti,0.0,1.1,20.8,91.1,4020.7,93.5,199.4
RedHat,Blocks,lti,0.0,2.1,1119.1,2767.7,5829.9,1541.8,1622.6
RedHat,Causality,cti,0.0,7.3,43.9,149.4,3948.3,155.4,309.5
RedHat,Causality,lti,0.0,0.0,359.4,1160.0,2693.0,600.7,652.4
RedHat,Cloners,cti,0.0,0.0,13.4,71.2,2884.2,71.5,171.0
RedHat,Cloners,lti,0.0,0.0,0.0,0.0,5546.1,137.0,661.2
RedHat,Depend,cti,0.0,0.0,17.1,86.0,2884.0,79.8,180.4
RedHat,Depend,lti,0.0,0.0,0.0,4.4,874.4,13.9,46.8
RedHat,Document,cti,0.0,13.0,53.7,121.0,2700.0,119.3,221.6
RedHat,Document,lti,0.0,0.0,0.0,5.8,997.2,17.3,62.6
RedHat,Duplicate,cti,0.0,5.7,33.0,140.8,3196.8,138.3,275.9
RedHat,Duplicate,lti,0.0,0.2,3.9,34.2,2638.1,50.5,144.9
RedHat,Epic,cti,0.0,1.5,39.4,129.4,4159.7,103.3,169.9
RedHat,Epic,lti,0.0,0.0,0.0,0.0,1444.8,6.4,35.3
RedHat,Incorporates,cti,0.0,5.1,26.8,96.3,4179.5,106.5,238.7
RedHat,Incorporates,lti,0.0,0.0,0.0,6.9,3161.8,19.8,95.8
RedHat,Issue split,cti,0.0,14.0,28.0,92.8,2128.7,87.2,163.3
RedHat,Issue split,lti,0.0,0.0,0.0,107.6,1163.8,102.2,200.3
RedHat,Related,cti,0.0,7.4,46.8,165.6,5372.6,159.9,309.3
RedHat,Related,lti,0.0,0.0,0.1,36.2,3677.8,181.5,478.4
RedHat,Subtask,cti,0.0,0.0,1.6,46.2,3593.7,60.1,159.3
RedHat,Subtask,lti,0.0,0.0,0.0,0.0,3677.8,1.3,27.4
RedHat,Triggers,cti,0.0,11.3,32.2,124.0,1173.6,100.4,167.2
RedHat,Triggers,lti,0.0,0.0,0.0,0.2,2599.0,17.6,134.1

View File

@ -0,0 +1,360 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"from time import time\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"from pymongo import MongoClient\n",
"from pymongo.database import Database"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# 软件生态名\n",
"ECO_NAMES = [\n",
" # \"Apache\",\n",
" # \"Jira\",\n",
" # \"Mojang\",\n",
" # \"MongoDB\",\n",
" # \"Qt\",\n",
" \"RedHat\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 保存Issue和链接原始数据的目录\n",
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
"ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
"LINK_DIR = Path(\"../data/raw/links\")\n",
"LINK_DIR.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 格式化时间间隔\n",
"def format_duration(start_time, end_time):\n",
" # 计算总秒数\n",
" seconds = end_time - start_time\n",
" # 计算分钟和小时数\n",
" minutes = int(seconds / 60)\n",
" hours = int(minutes / 60)\n",
" display_minutes = int(minutes % 60)\n",
" display_seconds = int(seconds % 60)\n",
"\n",
" return f\"{hours:02}:{display_minutes:02}:{display_seconds:02}\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def extract_issues_to_csv(eco_name: str, db: Database):\n",
" # 提取生态中的Issue字段数据保存至csv文件\n",
" issues_collection = db[eco_name]\n",
" # 迭代访问所有文档\n",
" cursor = issues_collection.find({})\n",
"\n",
" issues = []\n",
" # 分别记录提取的Issue和comments总数\n",
" num_comments = 0\n",
"\n",
" for issue in tqdm(cursor):\n",
" try:\n",
" key = issue[\"key\"] # Issue关键字\n",
"\n",
" project_key = issue[\"fields\"][\"project\"][\"key\"] # 所属项目关键字\n",
"\n",
" project_name = issue[\"fields\"][\"project\"][\"name\"] # 所属项目名\n",
"\n",
" try:\n",
" issue_type = issue[\"fields\"][\"issuetype\"][\"name\"] # Issue类型\n",
" except Exception:\n",
" issue_type = None\n",
"\n",
" try:\n",
" status = issue[\"fields\"][\"status\"][\"name\"] # 状态\n",
" except Exception:\n",
" status = None\n",
"\n",
" try:\n",
" resolution = issue[\"fields\"][\"resolution\"][\"name\"] # 解决与否\n",
" except Exception:\n",
" resolution = None\n",
"\n",
" try:\n",
" created_time = issue[\"fields\"][\"created\"] # 创建时间\n",
" except Exception:\n",
" created_time = None\n",
"\n",
" try:\n",
" priority = issue[\"fields\"][\"priority\"][\"name\"] # 优先级\n",
" except Exception:\n",
" priority = None\n",
"\n",
" try:\n",
" title = issue[\"fields\"][\"summary\"] # 标题\n",
" except Exception:\n",
" title = None\n",
"\n",
" try:\n",
" description = issue[\"fields\"][\"description\"] # 描述\n",
" except Exception:\n",
" description = None\n",
"\n",
" num_issue_comments = issue[\"fields\"][\"comment\"][\n",
" \"total\"\n",
" ] # 该Issue的评论数量\n",
"\n",
" issue_dict = {\n",
" \"key\": key,\n",
" \"project_key\": project_key,\n",
" \"project_name\": project_name,\n",
" \"issue_type\": issue_type,\n",
" \"status\": status,\n",
" \"resolution\": resolution,\n",
" \"created_time\": created_time,\n",
" \"priority\": priority,\n",
" \"title\": title,\n",
" \"description\": description,\n",
" \"num_comments\": num_issue_comments,\n",
" }\n",
"\n",
" issues.append(issue_dict)\n",
" num_comments += num_issue_comments\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" filename = ISSUE_DIR / (eco_name + \".csv\")\n",
" with open(filename, \"w\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
" dict_wirter = csv.DictWriter(output_file, issues[0].keys(), delimiter=\";\")\n",
" dict_wirter.writeheader()\n",
" dict_wirter.writerows(issues)\n",
"\n",
" print(\n",
" f\"✔ Extracted {len(issues)} raw issues with {num_comments} comments from {eco_name}\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Epic链接自定义字段\n",
"EPICLINK_FIELD_DICT = {\n",
" \"Apache\": \"customfield_12311120\",\n",
" \"Jira\": \"customfield_12931\",\n",
" \"Mojang\": \"customfield_11602\",\n",
" \"MongoDB\": \"customfield_10857\",\n",
" \"Qt\": \"customfield_10400\",\n",
" \"RedHat\": \"customfield_12311140\",\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def extract_links_to_csv(eco_name: str, db: Database):\n",
" # 提取生态中的链接数据保存至csv文件\n",
"\n",
" issues_collection = db[eco_name]\n",
"\n",
" links = []\n",
" cursor = issues_collection.find({})\n",
" for issue in tqdm(cursor):\n",
" try:\n",
" key = issue[\"key\"]\n",
"\n",
" # 提取一般类型链接\n",
" issuelinks = issue[\"fields\"][\"issuelinks\"]\n",
" for issuelink in issuelinks:\n",
" link_type = issuelink[\"type\"][\"name\"]\n",
"\n",
" try:\n",
" in_issue_key = key\n",
" out_issue_key = issuelink[\"outwardIssue\"][\"key\"]\n",
" except Exception:\n",
" in_issue_key = issuelink[\"inwardIssue\"][\"key\"]\n",
" out_issue_key = key\n",
"\n",
" link_key = in_issue_key + \"_\" + out_issue_key # 用作链接的唯一标识\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" # 提取Subtask链接\n",
" subtasks = issue[\"fields\"][\"subtasks\"]\n",
" for subtask in subtasks:\n",
"\n",
" link_type = \"Subtask\"\n",
" in_issue_key = key\n",
" out_issue_key = subtask[\n",
" \"key\"\n",
" ] # Subtask类型链接都是由父Issue指向子Issue\n",
"\n",
" link_key = in_issue_key + \"_\" + out_issue_key\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" # 提取Epic链接\n",
" try:\n",
" epic_key = issue[\"fields\"][EPICLINK_FIELD_DICT[eco_name]]\n",
" in_issue_key = key\n",
" out_issue_key = epic_key\n",
" link_key = in_issue_key + \"_\" + out_issue_key\n",
" link_type = \"Epic\"\n",
"\n",
" link_dict = {\n",
" \"link_key\": link_key,\n",
" \"link_type\": link_type,\n",
" \"in_issue_key\": in_issue_key,\n",
" \"out_issue_key\": out_issue_key,\n",
" }\n",
"\n",
" links.append(link_dict)\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" except Exception:\n",
" pass\n",
"\n",
" filename = LINK_DIR / (eco_name + \".csv\")\n",
" with open(filename, \"w\", errors=\"surrogatepass\", encoding=\"utf-8\") as output_file:\n",
" dict_wirter = csv.DictWriter(output_file, links[0].keys(), delimiter=\";\")\n",
" dict_wirter.writeheader()\n",
" dict_wirter.writerows(links)\n",
"\n",
" print(f\"✔ Extracted {len(links)} raw links from {eco_name}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting issues and links data from database...\n",
"Working on ecosystem: RedHat ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"502297it [00:43, 11442.78it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Extracted 502297 raw issues with 1115471 comments from RedHat\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"502297it [00:38, 13038.55it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Extracted 405070 raw links from RedHat\n",
"✔ RedHat completely processed. Duration: 00:01:31\n",
"\n",
"✅ All completely processed. Total duration: 00:01:31\n"
]
}
],
"source": [
"with MongoClient() as client:\n",
" start_time = time() # 记录总处理时间\n",
" db = client[\"JiraEcos\"]\n",
" print(\"Extracting issues and links data from database...\")\n",
"\n",
" for eco_name in ECO_NAMES:\n",
" eco_start_time = time() # 记录提取每个生态数据的时间\n",
" print(f\"Working on ecosystem: {eco_name} ...\")\n",
"\n",
" extract_issues_to_csv(eco_name, db)\n",
" extract_links_to_csv(eco_name, db)\n",
"\n",
" print(\n",
" f\"✔ {eco_name} completely processed. Duration: {format_duration(eco_start_time, time())}\"\n",
" )\n",
" print(\"\")\n",
"\n",
" print(\n",
" f\"✅ All completely processed. Total duration: {format_duration(start_time, time())}\"\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "grad_pro_env",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,589 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1345f1c8",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pymongo import MongoClient"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "46d49dd9",
"metadata": {},
"outputs": [],
"source": [
"# 软件生态名\n",
"ECO_NAMES = [\n",
" # \"Apache\",\n",
" # \"Jira\",\n",
" # \"Mojang\",\n",
" # \"MongoDB\",\n",
" # \"Qt\",\n",
" \"RedHat\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "03cf285e",
"metadata": {},
"outputs": [],
"source": [
"ISSUE_DIR = Path(\"../data/raw/issues\")\n",
"LINK_DIR = Path(\"../data/raw/links\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "61e8f96a",
"metadata": {},
"outputs": [],
"source": [
"PRO_ISSUE_DIR = Path(\"../data/processed/issues\")\n",
"PRO_ISSUE_DIR.mkdir(parents=True, exist_ok=True)\n",
"PRO_LINK_DIR = Path(\"../data/processed/links\")\n",
"PRO_LINK_DIR.mkdir(parents=True, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40885839",
"metadata": {},
"outputs": [],
"source": [
"def load_issues(eco_name: str):\n",
" # 加载Issue数据DataFrame\n",
"\n",
" filename = ISSUE_DIR / (eco_name + \".csv\")\n",
" issue_df = pd.read_csv(\n",
" filename, sep=\";\", encoding=\"utf-8\", low_memory=False, index_col=[\"key\"]\n",
" )\n",
" return issue_df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1981c072",
"metadata": {},
"outputs": [],
"source": [
"def load_links(eco_name: str):\n",
" # 加载链接数据DataFrame\n",
"\n",
" filename = LINK_DIR / (eco_name + \".csv\")\n",
" link_df = pd.read_csv(filename, sep=\";\", encoding=\"utf-8\", low_memory=False)\n",
" return link_df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "865088c5",
"metadata": {},
"outputs": [],
"source": [
"def clean_issues(issue_df: pd.DataFrame):\n",
" # 对Issue数据进行清洗\n",
"\n",
" # 把时间数据转换为统一格式\n",
" issue_df[\"created_time\"] = pd.to_datetime(\n",
" issue_df[\"created_time\"], errors=\"coerce\"\n",
" ).apply(lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan)\n",
"\n",
" return issue_df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4447a7d2",
"metadata": {},
"outputs": [],
"source": [
"def clean_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
" # 对链接数据进行清洗\n",
"\n",
" def column_transform(row):\n",
" return str(sorted(set([row[\"in_issue_key\"], row[\"out_issue_key\"]])))\n",
"\n",
" # 一条一般类型链接会在两个Issue的字段中存在需要清除其中一个副本\n",
" link_df.drop_duplicates(inplace=True)\n",
" print(f\"Left with {len(link_df)} links after removing link duplication\")\n",
"\n",
" # 清除Issue是私有的、无权访问的链接\n",
" condition = (\n",
" link_df[[\"in_issue_key\", \"out_issue_key\"]]\n",
" .isin(issue_df.index.values)\n",
" .all(axis=1)\n",
" )\n",
" link_df = link_df[condition]\n",
" print(f\"Left with {len(link_df)} links after removing half-private issues\")\n",
"\n",
" # 一对Issue间只允许存在一条链接需要删除含有多条链接的Issue对\n",
" # 首先基于'link_key'字段删除重复的Issue对\n",
" # !注意:相同的'link_key'的Issue对之间是可能存在多种类型的链接这会混淆关联关系所以全部清除\n",
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
"\n",
" # 其次,以防'link_key'是反过来的比如issue1_issue2和issue2_issue1\n",
" # 所以添加'sorted_issue_keys'字段由链接的两个Issue的key升序组成\n",
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
" # 找出链接的两端Issue的keys相同的行对应的'sorted_issue_keys'字段值\n",
" doublelinks = (\n",
" (link_df[\"sorted_issue_keys\"].value_counts() > 1)\n",
" .rename_axis(\"doubles\")\n",
" .reset_index(name=\"valid\")\n",
" )\n",
" valid_double_keys = set(doublelinks[doublelinks[\"valid\"] == True][\"doubles\"])\n",
"\n",
" # 把重复的'sorted_issue_keys'字段对应的链接类型取出来检查若类型数大于1则清除这些Issue对\n",
" for i in tqdm(valid_double_keys):\n",
" if len(set(link_df[link_df[\"sorted_issue_keys\"] == i][\"link_type\"])) > 1:\n",
" condition = link_df[\"sorted_issue_keys\"] != i\n",
" link_df = link_df[condition]\n",
" print(\n",
" f\"Left with {len(link_df)} links after removing issue-pairs with multiple types of links between them\"\n",
" )\n",
"\n",
" # 最后留下来的链接中仍然可能有重复链接类型的Issue对通过Issue的key对调的方式实现的清除其中一个\n",
" link_df.drop_duplicates(subset=[\"sorted_issue_keys\"], inplace=True)\n",
" print(\n",
" f\"Left with {len(link_df)} links after removing issue-pairs with duplicate same type of links\"\n",
" )\n",
"\n",
" link_df.reset_index(inplace=True, drop=True)\n",
"\n",
" return link_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ab8df842",
"metadata": {},
"outputs": [],
"source": [
"def joined_links(link_df: pd.DataFrame, issue_df: pd.DataFrame):\n",
" # 联合Issue和链接数据\n",
"\n",
" joined_df = link_df.join(issue_df.add_suffix(\"_in\"), on=\"in_issue_key\").join(\n",
" issue_df.add_suffix(\"_out\"), on=\"out_issue_key\"\n",
" )\n",
"\n",
" # !注意补充Subtask类型链接创建时间\n",
" joined_df.loc[\n",
" (joined_df[\"link_type\"] == \"Subtask\") & (joined_df[\"link_created_time\"].isna()),\n",
" \"link_created_time\",\n",
" ] = joined_df[\"created_time_out\"]\n",
"\n",
" return joined_df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a496e37",
"metadata": {},
"outputs": [],
"source": [
"def query_issue_closed_time(eco_name: str, issue_df: pd.DataFrame):\n",
" # 查询Issue的history获取Issue关闭时间\n",
"\n",
" # 定义一个函数,用于处理每个分组\n",
" def handle_group(group):\n",
" # 如果分组内的closed_time全为NaN则保留该分组的第一行\n",
" if group[\"closed_time\"].isna().all():\n",
" return group.iloc[0:1]\n",
" # 否则返回closed_time最大值对应的行\n",
" else:\n",
" return group.loc[[group[\"closed_time\"].idxmax()]]\n",
"\n",
" # 把索引列转换为普通列列名为key\n",
" issue_df = issue_df.reset_index().rename(columns={\"index\": \"key\"})\n",
" # 创建Issue关闭时间列\n",
" issue_df[\"closed_time\"] = None\n",
"\n",
" with MongoClient() as client:\n",
" # 链接数据库\n",
" db = client[\"JiraEcos\"]\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
"\n",
" # 首先取出需要查询的Issue keys并移除重复项\n",
" issue_keys = issue_df[\"key\"].unique().tolist()\n",
"\n",
" # 构造聚合查询管道\n",
" pipeline = [\n",
" {\n",
" # 第一步筛选key在issue_keys列表中的文档\n",
" \"$match\": {\"key\": {\"$in\": issue_keys}}\n",
" },\n",
" {\n",
" # 第二步展开history.items数组\n",
" # 进而返回每个具体更改事件\n",
" \"$unwind\": \"$history.items\"\n",
" },\n",
" {\n",
" # 第三步再次筛选满足特定field值的展开后的文档\n",
" # field字段为status保证更改事件是修改Issue状态\n",
" # toString字段为Closed保证是关闭Issue\n",
" \"$match\": {\n",
" \"history.items.field\": \"status\",\n",
" \"history.items.toString\": \"Closed\",\n",
" }\n",
" },\n",
" {\n",
" # 第四步:指定返回文档的字段\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"query_key\": \"$key\",\n",
" \"created\": \"$history.created\",\n",
" \"field\": \"$history.items.field\",\n",
" \"to\": \"$history.items.to\",\n",
" \"toString\": \"$history.items.toString\",\n",
" }\n",
" },\n",
" ]\n",
"\n",
" # 查询数据库\n",
" query = list(histories_collection.aggregate(pipeline))\n",
" # 转换为DataFrame\n",
" query_df = pd.DataFrame(query)\n",
" # print(query_df.head())\n",
"\n",
" print(\n",
" f\"❕ Test print: {len(issue_df)} issues before merged with query DataFrame\"\n",
" )\n",
"\n",
" # 合并DataFrame基于key与query_key匹配\n",
" merged_df = pd.merge(\n",
" issue_df,\n",
" query_df,\n",
" left_on=\"key\",\n",
" right_on=\"query_key\",\n",
" how=\"left\",\n",
" )\n",
"\n",
" print(\n",
" f\"❕ Test print: {len(merged_df)} issues after merged with query DataFrame\"\n",
" )\n",
"\n",
" # 将merged_df中的created值赋给合并后DataFrame的closed_time字段\n",
" merged_df[\"closed_time\"] = pd.to_datetime(merged_df[\"created\"], errors=\"coerce\")\n",
"\n",
" # 裁切出需要的字段\n",
" issue_df = merged_df[list(issue_df.columns)]\n",
"\n",
" # 最后由于Issue可能会被多次开启与关闭所以保留最后一次关闭时间\n",
" # 按照key进行分组使用groupby和apply处理每个分组\n",
" result_df = (\n",
" issue_df.groupby(\"key\", as_index=False)\n",
" .apply(handle_group)\n",
" .reset_index(drop=True)\n",
" )\n",
" # 统一时间格式\n",
" result_df[\"closed_time\"] = result_df[\"closed_time\"].apply(\n",
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
" )\n",
" # 把key列重新设置为索引列\n",
" result_df = result_df.set_index(\"key\")\n",
"\n",
" print(f\"❕ Test print: {len(result_df)} issues after processed done\")\n",
"\n",
" return result_df"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3401742e",
"metadata": {},
"outputs": [],
"source": [
"def query_link_created_time(eco_name: str, link_df: pd.DataFrame):\n",
" # 查询Issue的history获取链接创建时间\n",
"\n",
" # 定义一个函数,用于处理每个分组\n",
" def handle_group(group):\n",
" # 如果分组内的link_created_time全为NaN则保留该分组的第一行\n",
" if group[\"link_created_time\"].isna().all():\n",
" return group.iloc[0:1]\n",
" # 否则返回link_created_time最大值对应的行\n",
" else:\n",
" return group.loc[[group[\"link_created_time\"].idxmax()]]\n",
"\n",
" # 裁切出需要的字段\n",
" link_df = link_df[[\"link_type\", \"in_issue_key\", \"out_issue_key\"]]\n",
" # 创建链接创建时间列\n",
" link_df[\"link_created_time\"] = None\n",
"\n",
" with MongoClient() as client:\n",
" # 链接数据库\n",
" db = client[\"JiraEcos\"]\n",
" histories_collection = db[eco_name + \"Histories\"]\n",
"\n",
" # 首先取出需要查询的Issue keys并移除重复项\n",
" out_issue_keys = link_df[\"out_issue_key\"].tolist()\n",
" out_issue_keys = list(set(out_issue_keys))\n",
"\n",
" # 构造聚合查询管道\n",
" pipeline = [\n",
" {\n",
" # 第一步筛选key在out_issue_keys列表中的文档\n",
" # 从而保证只取出有链接的Issue的更改事件\n",
" \"$match\": {\"key\": {\"$in\": out_issue_keys}}\n",
" },\n",
" {\n",
" # 第二步展开history.items数组\n",
" # 进而返回每个具体更改事件\n",
" \"$unwind\": \"$history.items\"\n",
" },\n",
" {\n",
" # 第三步再次筛选满足特定field值的展开后的文档\n",
" # field保证更改事件是链接创建或删除\n",
" # to或toString字段不为空保证是创建链接的事件而不是删除\n",
" \"$match\": {\n",
" \"history.items.field\": {\n",
" \"$in\": [\"Link\", \"Epic Child\", \"Parent\", \"Parent Issue\"]\n",
" },\n",
" \"$or\": [\n",
" {\"history.items.to\": {\"$ne\": None}},\n",
" {\"history.items.toString\": {\"$ne\": None}},\n",
" ],\n",
" }\n",
" },\n",
" {\n",
" # 第四步:指定返回文档的格式\n",
" # target_key根据链接类型获取to或toString字段信息\n",
" \"$project\": {\n",
" \"_id\": 0,\n",
" \"key\": 1,\n",
" \"created\": \"$history.created\",\n",
" \"field\": \"$history.items.field\",\n",
" \"to\": \"$history.items.to\",\n",
" \"toString\": \"$history.items.toString\",\n",
" \"target_key\": {\n",
" \"$cond\": {\n",
" \"if\": {\"$eq\": [\"$history.items.field\", \"Link\"]},\n",
" \"then\": \"$history.items.to\",\n",
" \"else\": \"$history.items.toString\",\n",
" }\n",
" },\n",
" }\n",
" },\n",
" ]\n",
"\n",
" # 查询数据库\n",
" query = list(histories_collection.aggregate(pipeline))\n",
" # 转换为DataFrame\n",
" query_df = pd.DataFrame(query)\n",
" # print(query_df.head())\n",
"\n",
" # 合并DataFrame基于out_issue_key和key匹配in_issue_key和target_key匹配\n",
" merged_df = pd.merge(\n",
" link_df,\n",
" query_df,\n",
" left_on=[\"out_issue_key\", \"in_issue_key\"],\n",
" right_on=[\"key\", \"target_key\"],\n",
" how=\"left\",\n",
" )\n",
"\n",
" # 将merged_df中的created值赋给合并后DataFrame的link_created_time字段\n",
" merged_df[\"link_created_time\"] = merged_df[\"created\"]\n",
"\n",
" # 裁切出需要的字段\n",
" link_df = merged_df[\n",
" [\"link_type\", \"in_issue_key\", \"out_issue_key\", \"link_created_time\"]\n",
" ]\n",
"\n",
" # 最后由于in_issue和out_issue之间可能会发生相同类型链接的多次创建活动\n",
" # 所以,保留最后一次链接创建时间\n",
" # 转换link_created_time为datetime以确保比较的准确性\n",
" link_df[\"link_created_time\"] = pd.to_datetime(\n",
" link_df[\"link_created_time\"], errors=\"coerce\"\n",
" )\n",
"\n",
" # 按照除link_created_time以外的所有字段进行分组使用groupby和apply处理每个分组\n",
" result_df = (\n",
" link_df.groupby(\n",
" [\"link_type\", \"in_issue_key\", \"out_issue_key\"], as_index=False\n",
" )\n",
" .apply(handle_group)\n",
" .reset_index(drop=True)\n",
" )\n",
" # 统一时间格式\n",
" result_df[\"link_created_time\"] = result_df[\"link_created_time\"].apply(\n",
" lambda x: x.strftime(\"%Y-%m-%d %H:%M:%S\") if pd.notna(x) else np.nan\n",
" )\n",
"\n",
" return result_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b42827a1",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✔ Loaded 502297 raw issues and 405070 raw links for RedHat\n",
"Left with 268935 links after removing link duplication\n",
"Left with 249733 links after removing half-private issues\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_523178/4239802332.py:23: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df.drop_duplicates(subset=[\"link_key\"], keep=False, inplace=True)\n",
"/tmp/ipykernel_523178/4239802332.py:27: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"sorted_issue_keys\"] = link_df.apply(column_transform, axis=1)\n",
"100%|██████████| 4004/4004 [01:48<00:00, 36.81it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Left with 238349 links after removing issue-pairs with multiple types of links between them\n",
"Left with 238053 links after removing issue-pairs with duplicate same type of links\n",
"✔ Cleaned 502297 issues for RedHat\n",
"✔ Cleaned 238053 links for RedHat\n",
"✔ Link type distribution:\n",
"link_type\n",
"Epic 67799\n",
"Subtask 45020\n",
"Related 44222\n",
"Cloners 29629\n",
"Blocks 21106\n",
"Incorporates 12847\n",
"Duplicate 7080\n",
"Causality 4122\n",
"Depend 2849\n",
"Document 1652\n",
"Issue split 694\n",
"Account 568\n",
"Triggers 465\n",
"Name: count, dtype: int64\n",
"❕ Test print: 502297 issues before merged with query DataFrame\n",
"❕ Test print: 561112 issues after merged with query DataFrame\n",
"❕ Test print: 502297 issues after processed done\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_523178/834440382.py:16: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"link_created_time\"] = None\n",
"/tmp/ipykernel_523178/834440382.py:100: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" link_df[\"link_created_time\"] = pd.to_datetime(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ ----------------------------\n",
"\n"
]
}
],
"source": [
"for eco_name in ECO_NAMES:\n",
" # 加载Issue和链接数据DataFrame\n",
" issue_df = load_issues(eco_name)\n",
" link_df = load_links(eco_name)\n",
" print(\n",
" f\"✔ Loaded {len(issue_df)} raw issues and {len(link_df)} raw links for {eco_name}\"\n",
" )\n",
"\n",
" # 对Issue和链接数据进行清理\n",
" issue_df = clean_issues(issue_df)\n",
" link_df = clean_links(link_df, issue_df)\n",
" print(f\"✔ Cleaned {len(issue_df)} issues for {eco_name}\")\n",
" print(f\"✔ Cleaned {len(link_df)} links for {eco_name}\")\n",
"\n",
" # 打印不同链接类型分布\n",
" print(\"✔ Link type distribution:\")\n",
" print(link_df[\"link_type\"].value_counts())\n",
"\n",
" # 添加Issue关闭时间\n",
" issue_df = query_issue_closed_time(eco_name, issue_df)\n",
"\n",
" # 添加链接创建时间\n",
" link_df = query_link_created_time(eco_name, link_df)\n",
"\n",
" # 联合Issue和链接数据\n",
" link_df = joined_links(link_df, issue_df)\n",
"\n",
" # 保存清理后的Issue和链接数据\n",
" issue_df.to_csv(\n",
" PRO_ISSUE_DIR / (eco_name + \".csv\"),\n",
" sep=\";\",\n",
" index=True, #! issue_df的key被设置为了索引列所以这里需要保存\n",
" )\n",
" link_df.to_csv(\n",
" PRO_LINK_DIR / (eco_name + \".csv\"),\n",
" sep=\";\",\n",
" index=False,\n",
" )\n",
"\n",
" print(\"✅ ----------------------------\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long