# File: github_mongoDB_dataprocess/factor_analysis.py (Python, 398 lines, 20 KiB)
#
# Repository-viewer notice: this file contains Unicode characters that might be
# confused with other characters. If that is intentional, the warning can be
# safely ignored.

from pymongo import MongoClient
from datetime import datetime
import matplotlib.pyplot as plt
import re
import matplotlib
import numpy as np
# Compute a time difference; the result is a number of seconds.
def get_time_difference(t_old: datetime, t_new: datetime) -> float:
    """Return the time elapsed from ``t_old`` to ``t_new``, in seconds.

    Args:
        t_old: The earlier point in time.
        t_new: The later point in time.

    Returns:
        ``(t_new - t_old)`` expressed in seconds. The value is negative
        when ``t_new`` lies before ``t_old``.
    """
    return (t_new - t_old).total_seconds()
# Compute the time differences between neighbouring list elements.
def get_time_difference_list(time_list):
    """Return the gaps, in seconds, between consecutive entries of ``time_list``.

    Args:
        time_list: A list of datetimes, taken in the order given (the list is
            not sorted here, so a gap is negative wherever an entry precedes
            its predecessor).

    Returns:
        A list with ``len(time_list) - 1`` floats; empty when ``time_list``
        has fewer than two entries.
    """
    # Pair every element with its successor instead of indexing by position.
    return [
        get_time_difference(earlier, later)
        for earlier, later in zip(time_list, time_list[1:])
    ]
# Connect to the local MongoDB server and pick the two collections used below.
mongo_host, mongo_port = 'localhost', 27017
client = MongoClient(mongo_host, mongo_port)
db = client['numpy_db']
collection1 = db['issue_pr']
collection2 = db['issue_timeline_new']

# Closed issues only: documents that carry a "pull_request" field are excluded.
closed_issue_filter = {"state": "closed", "pull_request": {"$exists": False}}
# Restrict the result to the fields the rest of the script reads.
issue_fields = {"number": 1, "created_at": 1, "closed_at": 1, "body": 1, "title": 1, "_id": 0}
# Iterate in ascending issue-number order, fetching 10 documents per batch.
issue_data = (
    collection1.find(closed_issue_filter, issue_fields)
    .sort([("number", 1)])
    .batch_size(10)
)
# Data initialisation.

# 30-slot zero-filled buffers. Nothing in the visible part of the script reads
# them; they are kept in case code elsewhere in the file depends on them.
first_interval = [0] * 30
last_interval = [0] * 30
all_interval = [0] * 30
first_all = [0] * 30
last_all = [0] * 30
# count = [0]*30
time = [0] * 30

# Per event type: the summed duration in seconds (``*_without_time``) and the
# number of issues (``*_without_count``) accumulated by the main loop for
# issues whose timeline holds a single interval, i.e. no event of that type.
mentioned_without_time = 0
mentioned_without_count = 0
assigned_without_time = 0
assigned_without_count = 0
cross_referenced_without_time = 0
cross_referenced_without_count = 0
labeled_without_time = 0
labeled_without_count = 0
link_without_time = 0
link_without_count = 0

# Create the scatter-plot series x_1..x_20 and y_1..y_20 as module-level lists.
# They are written through globals() rather than locals(): the two are the same
# dictionary only at module scope, so globals() states the intent explicitly
# and keeps working if this code is ever moved into a function.
for i in range(1, 21):
    globals()['x_' + str(i)] = []
    globals()['y_' + str(i)] = []
i = 0
# ---------------------------------------------------------------------------
# Main pass over the closed issues.
#
# For every issue, five kinds of timestamps are collected: "mentioned" (@),
# "assigned", "cross-referenced" and "labeled" timeline events, plus "link"
# (one timestamp per GitHub issue/pull-request URL found in comments, the issue
# body or the issue title). Each kind is turned into a timeline
# [created, events..., closed]; the gaps of that timeline feed the scatter-plot
# series x_N / y_N and the ``*_without_*`` accumulators.
# ---------------------------------------------------------------------------

# Layout of the ``created_at`` / ``closed_at`` strings stored in MongoDB.
_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

# URLs pointing at a GitHub issue or pull request. Compiled once here instead
# of being rebuilt for every issue.
_LINK_PATTERN = re.compile(
    r'github.com/[a-zA-Z0-9-_.]+/[a-zA-Z0-9-_.]+/issues/[0-9]+|github.com/[a-zA-Z0-9-_.]+/[a-zA-Z0-9-_.]+/pull/[0-9]+')


def _parse_timestamp(text):
    """Parse a stored timestamp string into a ``datetime``."""
    return datetime.strptime(text, _TIMESTAMP_FORMAT)


def _to_days(seconds):
    """Convert a duration in seconds to days."""
    return seconds / 3600 / 24


def _event_times(event_name, issue_number):
    """Return the sorted timestamps of one timeline event type for one issue."""
    cursor = collection2.find(
        {"event": event_name, "issue_number": issue_number},
        {"created_at": 1, "_id": 0}).sort([("created_at", 1)])
    return sorted(_parse_timestamp(event["created_at"]) for event in cursor)


def _link_times(issue, created_at):
    """Return one sorted timestamp per issue/pull-request link tied to ``issue``.

    Links found in a comment are stamped with the comment's ``created_at``;
    links found in the issue body or title are stamped with ``created_at`` of
    the issue itself.
    """
    comments = collection2.find(
        {"event": "commented", "issue_number": issue['number']},
        {"created_at": 1, "body": 1, "_id": 0}).sort([("created_at", 1)])
    times = []
    for comment in comments:
        # A comment whose body is missing or null carries no links; handing
        # None to the regex would raise TypeError and abort the whole run.
        hits = len(_LINK_PATTERN.findall(comment.get('body') or ''))
        if hits:
            times.extend([_parse_timestamp(comment["created_at"])] * hits)
    for text in (issue.get('body'), issue.get('title')):
        if text:
            times.extend([created_at] * len(_LINK_PATTERN.findall(text)))
    times.sort()
    return times


def _bounded_timeline(event_times, created_at, closed_at):
    """Return ``[created_at, *event_times, closed_at]`` without late entries.

    Every entry before the final ``closed_at`` that lies after the issue was
    closed is dropped.
    """
    timeline = [moment for moment in [created_at] + event_times if moment <= closed_at]
    timeline.append(closed_at)
    return timeline


def _record_points(timeline, intervals, once_to_close, once_total,
                   twice_after_first, twice_after_second):
    """Append one issue's scatter points for one event type.

    Each ``once_*`` / ``twice_*`` argument is an ``(x_list, y_list)`` pair that
    is appended to in place. All appended values are in days.

    * one interval (no event): nothing is appended and the interval, in
      seconds, is returned so the caller can add it to its baseline totals;
    * two intervals (one event): x is open -> event for both pairs, y is
      event -> close for ``once_to_close`` and open -> close for ``once_total``;
    * three intervals (two events): x is open -> first event for both pairs,
      y is first event -> close for ``twice_after_first`` and the last interval
      for ``twice_after_second``;
    * more intervals: nothing is recorded.

    Returns:
        The single interval in seconds in the first case, otherwise ``None``.
    """
    if len(intervals) == 1:
        return intervals[0]
    first_wait = _to_days(intervals[0])
    if len(intervals) == 2:
        once_to_close[0].append(first_wait)
        once_to_close[1].append(_to_days(get_time_difference(timeline[1], timeline[-1])))
        once_total[0].append(first_wait)
        once_total[1].append(_to_days(get_time_difference(timeline[0], timeline[-1])))
    elif len(intervals) == 3:
        twice_after_first[0].append(first_wait)
        twice_after_first[1].append(_to_days(get_time_difference(timeline[1], timeline[-1])))
        twice_after_second[0].append(first_wait)
        twice_after_second[1].append(_to_days(intervals[-1]))
    return None


for issue in issue_data:
    print(issue['number'], end=' ')

    # Collect the timestamps of the five event kinds.
    issue_number = issue['number']
    mentioned_events = _event_times("mentioned", issue_number)
    assigned_events = _event_times("assigned", issue_number)
    cross_referenced_events = _event_times("cross-referenced", issue_number)
    labeled_events = _event_times("labeled", issue_number)
    created_at = _parse_timestamp(issue["created_at"])
    link_events = _link_times(issue, created_at)
    closed_at = _parse_timestamp(issue["closed_at"])

    # Frame every event list with the issue's creation and closing timestamps
    # and drop events that happened after the issue was closed.
    mentioned_time = _bounded_timeline(mentioned_events, created_at, closed_at)
    assigned_time = _bounded_timeline(assigned_events, created_at, closed_at)
    cross_referenced_time = _bounded_timeline(cross_referenced_events, created_at, closed_at)
    labeled_time = _bounded_timeline(labeled_events, created_at, closed_at)
    link_time = _bounded_timeline(link_events, created_at, closed_at)

    mentioned_interval_list = get_time_difference_list(mentioned_time)
    assigned_interval_list = get_time_difference_list(assigned_time)
    cross_referenced_interval_list = get_time_difference_list(cross_referenced_time)
    labeled_interval_list = get_time_difference_list(labeled_time)
    link_interval_list = get_time_difference_list(link_time)

    # Skip the whole issue when, for any event kind, every gap is shorter than
    # 10 seconds: such timelines are treated as bot-triggered.
    if any(
        all(gap < 10 for gap in gaps)
        for gaps in (
            mentioned_interval_list,
            assigned_interval_list,
            cross_referenced_interval_list,
            labeled_interval_list,
            link_interval_list,
        )
    ):
        continue

    # mentioned -> series 1..4
    baseline = _record_points(mentioned_time, mentioned_interval_list,
                              (x_1, y_1), (x_2, y_2), (x_3, y_3), (x_4, y_4))
    if baseline is not None:
        mentioned_without_time += baseline
        mentioned_without_count += 1

    # assigned -> series 5..8
    baseline = _record_points(assigned_time, assigned_interval_list,
                              (x_5, y_5), (x_6, y_6), (x_7, y_7), (x_8, y_8))
    if baseline is not None:
        assigned_without_time += baseline
        assigned_without_count += 1

    # cross-referenced -> series 9..12
    baseline = _record_points(cross_referenced_time, cross_referenced_interval_list,
                              (x_9, y_9), (x_10, y_10), (x_11, y_11), (x_12, y_12))
    if baseline is not None:
        cross_referenced_without_time += baseline
        cross_referenced_without_count += 1

    # labeled -> series 13..16
    baseline = _record_points(labeled_time, labeled_interval_list,
                              (x_13, y_13), (x_14, y_14), (x_15, y_15), (x_16, y_16))
    if baseline is not None:
        labeled_without_time += baseline
        labeled_without_count += 1

    # link -> series 17..20
    baseline = _record_points(link_time, link_interval_list,
                              (x_17, y_17), (x_18, y_18), (x_19, y_19), (x_20, y_20))
    if baseline is not None:
        link_without_time += baseline
        link_without_count += 1
# Draw the scatter plots: one 2x2 figure per event type.
matplotlib.rcParams['font.family'] = 'SimHei'  # or any other installed Chinese font
matplotlib.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# (title, x label, y label) of the four panels, in the order
# [0, 0], [0, 1], [1, 0], [1, 1]. In English:
#   1. event occurred once:  issue opened -> event        vs  event -> issue closed
#   2. event occurred once:  issue opened -> event        vs  issue opened -> issue closed
#   3. event occurred twice: issue opened -> first event  vs  first event -> issue closed
#   4. event occurred twice: issue opened -> first event  vs  second event -> issue closed
_PANEL_LABELS = (
    ('事件发生一次', 'issue开启至事件发生时间', '事件发生至issue关闭时间'),
    ('事件发生一次', 'issue开启至事件发生时间', 'issue开启至issue关闭时间'),
    ('事件发生两次', 'issue开启至第一次事件发生时间', '事件第一次发生启至issue关闭时间'),
    ('事件发生两次', 'issue开启至第一次事件发生时间', '事件第二次发生启至issue关闭时间'),
)


def _plot_event_figure(title, without_time, without_count, series):
    """Draw the 2x2 scatter figure of one event type.

    Args:
        title: Figure title (the event type).
        without_time: Summed duration, in seconds, of the issues that had no
            event of this type.
        without_count: Number of such issues.
        series: Four ``(x_values, y_values)`` pairs, one per panel, in the
            order of ``_PANEL_LABELS``.

    Returns:
        ``(figure, axes, baseline_days)`` where ``baseline_days`` is the mean
        duration of the issues without this event, in days, drawn as a red
        dashed horizontal line in every panel. When there are no such issues
        the baseline is NaN and no line is drawn (the unguarded division
        would otherwise raise ZeroDivisionError).
    """
    figure, axes = plt.subplots(2, 2)
    figure.suptitle(title)
    has_baseline = without_count != 0
    if has_baseline:
        baseline_days = without_time / without_count / 3600 / 24
    else:
        baseline_days = float('nan')
    for axis, (xs, ys), (panel_title, x_label, y_label) in zip(axes.flat, series, _PANEL_LABELS):
        axis.scatter(xs, ys, s=1)
        if has_baseline:
            axis.axhline(y=baseline_days, color='red', linestyle='--')
        axis.set_title(panel_title)
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)
    # pyplot's tight_layout() only touches the current figure, so a single
    # call at the end would adjust the last figure alone; do it per figure.
    figure.tight_layout()
    return figure, axes, baseline_days


fig1, ax1, line1 = _plot_event_figure(
    'mentioned', mentioned_without_time, mentioned_without_count,
    ((x_1, y_1), (x_2, y_2), (x_3, y_3), (x_4, y_4)))
fig2, ax2, line2 = _plot_event_figure(
    'assigned', assigned_without_time, assigned_without_count,
    ((x_5, y_5), (x_6, y_6), (x_7, y_7), (x_8, y_8)))
fig3, ax3, line3 = _plot_event_figure(
    'cross-referenced', cross_referenced_without_time, cross_referenced_without_count,
    ((x_9, y_9), (x_10, y_10), (x_11, y_11), (x_12, y_12)))
fig4, ax4, line4 = _plot_event_figure(
    'labeled', labeled_without_time, labeled_without_count,
    ((x_13, y_13), (x_14, y_14), (x_15, y_15), (x_16, y_16)))
fig5, ax5, line5 = _plot_event_figure(
    'link', link_without_time, link_without_count,
    ((x_17, y_17), (x_18, y_18), (x_19, y_19), (x_20, y_20)))
plt.show()
def _percent_below(values, threshold):
    """Return the percentage of ``values`` strictly below ``threshold``.

    Returns ``None`` when ``values`` is empty, where the percentage is
    undefined (the plain division would raise ZeroDivisionError).
    """
    if not values:
        return None
    below = sum(1 for value in values if value < threshold)
    return below / len(values) * 100


# For each of the 20 y-series, report the share of points lying below the
# baseline (line1..line5) of the event type the series belongs to; series
# 4k+1..4k+4 share baseline line(k+1).
for i in range(20):
    series_values = globals()['y_' + str(i + 1)]
    baseline = globals()['line' + str(i // 4 + 1)]
    # The label restarts at y_1 for every event type (panel number within the
    # figure), so the output shows y_1..y_4 five times in figure order.
    print('y_' + str(i % 4 + 1) + ':')
    share = _percent_below(series_values, baseline)
    if share is None:
        print('n/a', end='')
    else:
        print(share, end='%')
    print('\n')
# count = 0
# for i in range(len(y_1)):
# if y_1[i]< line:
# count += 1
# print(count/len(y_1)*100,end='%')
# a = [5043, 1547, 803, 418, 274, 150, 115, 82, 61, 44, 38, 24, 27, 16, 9, 14, 14, 5, 9, 9, 1, 3, 4, 2, 2, 0, 2, 0, 1, 0]