github_mongoDB_dataprocess/Comprehensive Analysis.py

193 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from pymongo import MongoClient
from datetime import datetime
import matplotlib.pyplot as plt
import re
import matplotlib
import numpy as np
def get_time_difference(t_old, t_new):
    """Return the number of seconds from ``t_old`` to ``t_new``.

    Both arguments must support subtraction yielding a ``timedelta``
    (e.g. two ``datetime`` objects). The result is a float and is negative
    when ``t_new`` lies before ``t_old``.
    """
    elapsed = t_new - t_old
    return elapsed.total_seconds()
def get_time_difference_list(time_list):
    """Return the gaps, in seconds, between consecutive entries of ``time_list``.

    ``time_list`` is a sequence of datetimes. The result has one float per
    adjacent pair (``len(time_list) - 1`` entries), so an empty or
    single-element input yields an empty list. Each gap is computed exactly
    as ``get_time_difference(previous, current)`` would: later minus earlier,
    in seconds, negative when the sequence goes backwards in time.
    """
    return [
        (current - previous).total_seconds()
        for previous, current in zip(time_list, time_list[1:])
    ]
# Connect to the local MongoDB server and select the collections read below.
client = MongoClient('localhost', 27017)
db = client['numpy_db']
collection1 = db['issue_pr']
collection2 = db['issue_timeline_new']

# Closed issues only: documents that carry a `pull_request` field are excluded.
# The cursor is ordered by issue number and fetched in batches of 10.
_closed_issue_filter = {"state": "closed", "pull_request": {"$exists": False}}
_issue_projection = {"number": 1, "created_at": 1, "closed_at": 1, "body": 1, "title": 1, "_id": 0}
issue_data = (
    collection1.find(_closed_issue_filter, _issue_projection)
    .sort([("number", 1)])
    .batch_size(10)
)

# Only the commented-out experiments further down read these two names.
i = 0
ave_time = [0] * 6

# Accumulators keyed by a 5-tuple of 0/1 flags, one flag per event kind in the
# order (link, cross-referenced, assigned, mentioned, labeled). All 32 keys are
# created up front so later `+=` updates never miss.
#   combinations[key] -- summed open-to-close duration in seconds
#   num[key]          -- summed timeline lengths (event counts)
#   count[key]        -- number of issues that fell into this combination
combinations = {
    (a, b, c, d, e): 0
    for a in [0, 1]
    for b in [0, 1]
    for c in [0, 1]
    for d in [0, 1]
    for e in [0, 1]
}
num = dict.fromkeys(combinations, 0)
count = dict.fromkeys(combinations, 0)
# Layout of every `created_at` / `closed_at` string parsed in this section.
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
# Finds links to GitHub issues or pull requests inside comment text.
# Compiled once here instead of being rebuilt for every issue.
LINK_PATTERN = re.compile(
    r'github.com/[a-zA-Z0-9-_.]+/[a-zA-Z0-9-_.]+/issues/[0-9]+|github.com/[a-zA-Z0-9-_.]+/[a-zA-Z0-9-_.]+/pull/[0-9]+')
# A timeline whose every gap is below this many seconds is treated as
# bot-triggered and causes the whole issue to be skipped.
BOT_INTERVAL_SECONDS = 10


def _parse_timestamp(text):
    """Parse a timestamp string laid out as TIMESTAMP_FORMAT into a datetime."""
    return datetime.strptime(text, TIMESTAMP_FORMAT)


def _event_times(issue_number, event_type):
    """Return the sorted `created_at` datetimes of one event type for an issue."""
    cursor = collection2.find(
        {"event": event_type, "issue_number": issue_number},
        {"created_at": 1, "_id": 0}).sort([("created_at", 1)])
    return sorted(_parse_timestamp(event["created_at"]) for event in cursor)


def _link_times(issue_number):
    """Return sorted comment timestamps, one entry per issue/PR link in the comment.

    A comment containing k links contributes its timestamp k times.
    """
    cursor = collection2.find(
        {"event": "commented", "issue_number": issue_number},
        {"created_at": 1, "body": 1, "_id": 0}).sort([("created_at", 1)])
    times = []
    for comment in cursor:
        # re.findall raises TypeError on None; a comment stored with a missing
        # or null body is treated as containing no links instead of crashing.
        body = comment.get('body') or ''
        link_count = len(LINK_PATTERN.findall(body))
        if link_count:
            times.extend([_parse_timestamp(comment["created_at"])] * link_count)
    times.sort()
    return times


def _bounded_timeline(event_times, created_at, closed_at):
    """Return ``[created_at, *events, closed_at]`` without entries after closing.

    Entries later than ``closed_at`` are dropped; the closing timestamp itself
    is always kept as the last element.
    """
    timeline = [
        moment for moment in [created_at, *event_times]
        if get_time_difference(moment, closed_at) >= 0
    ]
    timeline.append(closed_at)
    return timeline


def _is_bot_driven(timeline):
    """Return True when every gap in ``timeline`` is under BOT_INTERVAL_SECONDS."""
    return all(gap < BOT_INTERVAL_SECONDS for gap in get_time_difference_list(timeline))


for issue in issue_data:
    print(issue['number'], end=' ')
    issue_number = issue['number']
    created_at = _parse_timestamp(issue["created_at"])
    closed_at = _parse_timestamp(issue["closed_at"])

    # Collect the timestamps of the five event kinds (links in comments,
    # labeled, mentioned, assigned, cross-referenced), each bracketed by the
    # issue's creation and closing times and trimmed of post-close events.
    link_time = _bounded_timeline(_link_times(issue_number), created_at, closed_at)
    labeled_time = _bounded_timeline(_event_times(issue_number, "labeled"), created_at, closed_at)
    mentioned_time = _bounded_timeline(_event_times(issue_number, "mentioned"), created_at, closed_at)
    assigned_time = _bounded_timeline(_event_times(issue_number, "assigned"), created_at, closed_at)
    cross_referenced_time = _bounded_timeline(
        _event_times(issue_number, "cross-referenced"), created_at, closed_at)

    # Flag order of the accumulator keys: link, cross-referenced, assigned,
    # mentioned, labeled.
    timelines = (link_time, cross_referenced_time, assigned_time, mentioned_time, labeled_time)

    # Skip the issue entirely if any of its timelines looks bot-triggered.
    if any(_is_bot_driven(timeline) for timeline in timelines):
        continue

    # A timeline longer than its two bracketing timestamps has at least one
    # real event of that kind, which sets the corresponding flag to 1.
    key = tuple(int(len(timeline) > 2) for timeline in timelines)
    combinations[key] += get_time_difference(link_time[0], link_time[-1])
    num[key] += sum(len(timeline) for timeline in timelines)
    count[key] += 1
#
# if i>3:
# break
# i += 1
# for i in range(6):
# if count[i]:
# ave_time[i] = ave_time[i]/count[i]/3600/24
# times = [i for i in range(6)]
# print(ave_time)
# print(times)
#
# plt.plot(times,ave_time)
# plt.show()
# Print one tab-separated row per flag combination:
#   key, average open-to-close time in days, average event count,
#   days per event (only when the event average is non-zero), issue count.
# The averages are also written back into `combinations` and `num`.
for key in combinations:
    issue_count = count[key]
    if issue_count:
        # Summed seconds -> mean per issue -> days.
        combinations[key] = round(combinations[key] / issue_count / 3600 / 24, 2)
        num[key] = round(num[key] / issue_count, 2)
    row = [key, combinations[key], num[key]]
    # num[key] stays 0 for combinations no issue fell into; dividing by it
    # would raise ZeroDivisionError, so the ratio column is emitted only when
    # the divisor is non-zero.
    if num[key]:
        row.append(round(combinations[key] / num[key], 2))
    row.append(issue_count)
    print(*row, sep='\t')