198 lines
6.3 KiB
Python
198 lines
6.3 KiB
Python
import numpy as np
|
||
import time
|
||
import argparse
|
||
import sys
|
||
from math import *
|
||
from collections import Counter
|
||
from calendar import Calendar
|
||
import datetime
|
||
from elasticsearch import Elasticsearch
|
||
# Module-level Elasticsearch client; get_eventLst() issues its searches through it.
# NOTE(review): the endpoint is hard-coded here - confirm whether it should come from configuration.
es = Elasticsearch(['http://106.75.10.84:9200'])
|
||
|
||
def get_eventLst(repo):
    """Fetch issue events for ``repo`` from the ``github_event_raw`` index.

    Runs a ``match`` query on ``search_fields.repo`` through the module-level
    ``es`` client and flattens every returned hit into a small dict.

    Args:
        repo: Repository name matched against ``search_fields.repo``.

    Returns:
        A list of dicts with the keys ``issue_number``, ``event`` and
        ``create_at``, in the order the hits were returned.
    """
    # Elasticsearch ``from`` offsets are zero-based; starting at 1 skipped
    # the first matching event.
    frm = 0
    sz = 100  # number of events requested per query
    eventLst = []
    while True:
        query = {
            "query": {"bool": {"must": [{"match": {"search_fields.repo": repo}}]}},
            'from': frm,
            'size': sz,
        }
        value = es.search(index="github_event_raw", body=query, _source=['data'])
        hits = value['hits']['hits']
        if not hits:
            break
        frm += sz
        for hit in hits:
            data = hit['_source']['data']
            eventLst.append({
                'issue_number': data['issue']['number'],
                'event': data['eventType'],
                'create_at': data['createdAt'],
            })
        # NOTE(review): the original loop body ends with an unconditional
        # ``break`` whose indentation was lost in this copy of the file. It is
        # kept at loop level, so only the first page (at most ``sz`` hits) is
        # fetched. Remove it to page through every hit - TODO confirm intent.
        break
    return eventLst
|
||
|
||
def filterEvent(startTime, endTime):
    """Count events per issue for events created between two bounds.

    Reads the module-level ``eventLst`` and keeps the events whose
    ``create_at`` value compares strictly greater than ``startTime`` and
    strictly less than ``endTime``.

    Args:
        startTime: Exclusive lower bound compared against ``create_at``.
        endTime: Exclusive upper bound compared against ``create_at``.

    Returns:
        ``{issue_number: {event: count}}`` for the retained events.
    """
    issue2eventCount = {}
    for record in eventLst:
        created = record['create_at']
        if not (created > startTime and created < endTime):
            continue
        # One nested counter per issue, created on first sight.
        perIssue = issue2eventCount.setdefault(record['issue_number'], {})
        perIssue[record['event']] = perIssue.get(record['event'], 0) + 1
    return issue2eventCount
|
||
|
||
def day2timeStamp(string):
    """Convert a date or timestamp string to an integer timestamp.

    A 10-character string is parsed as ``%Y-%m-%d`` and a 20-character string
    as ``%Y-%m-%dT%H:%M:%SZ``; the parsed value is passed to ``time.mktime``
    and truncated to ``int``.

    Args:
        string: The text to convert.

    Returns:
        The integer timestamp, or ``-1`` when the length is neither 10 nor 20.
    """
    formatByLength = {
        10: "%Y-%m-%d",
        20: "%Y-%m-%dT%H:%M:%SZ",
    }
    fmt = formatByLength.get(len(string))
    if fmt is None:
        return -1
    return int(time.mktime(time.strptime(string, fmt)))
|
||
|
||
def lstToEntropy(lst):
    """Return the base-2 Shannon entropy of ``lst`` scaled by its total.

    Each entry is treated as a count: with ``p = count / sum(lst)`` the result
    is ``sum(lst) * sum(-p * log(p, 2))``.

    Args:
        lst: List of counts.

    Returns:
        The scaled entropy, or ``0`` when ``lst`` is empty or sums to zero.
    """
    # Hoisted out of the loop: the total was previously recomputed for every
    # element, which made the function quadratic in len(lst).
    total = sum(lst)
    if not total:
        return 0
    result = 0
    for count in lst:
        if not count:
            # A zero count contributes nothing (0 * log 0 is taken as 0);
            # evaluating log(0) would raise ValueError.
            continue
        pi = count / total
        result += -pi * log(pi, 2)
    return result * total
|
||
|
||
def dataToEntropy(time2issuesEvent):
    """Compute one entropy value per time key from prepared event counts.

    Args:
        time2issuesEvent: ``{time: {issue_number: {event: count}}}``.

    Returns:
        ``{time: entropy}``. For a time key with issues, the value is the sum
        over its issues of ``lstToEntropy`` applied to that issue's event
        counts with an extra ``1`` appended; the sum is not divided by the
        number of issues. A time key with no issues maps to ``0``.
    """
    result = {}
    for timeKey, issue2event2count in time2issuesEvent.items():
        if not issue2event2count:
            result[timeKey] = 0
            continue
        accumulated = 0
        for event2count in issue2event2count.values():
            counts = list(event2count.values())
            counts.append(1)
            accumulated += lstToEntropy(counts)
        result[timeKey] = accumulated
    return result
|
||
|
||
def timelstToEntropy(timelst):
    """Aggregate step: turn a list of time boundaries into ``{time: entropy}``.

    Every pair of consecutive entries ``(timelst[i], timelst[i + 1])`` is
    passed to ``filterEvent``; the counts are stored under ``timelst[i]`` and
    the resulting mapping is handed to ``dataToEntropy``.

    Args:
        timelst: Ordered list of time boundaries.

    Returns:
        The mapping returned by ``dataToEntropy``.
    """
    time2issuesEvent = {
        start: filterEvent(start, end)
        for start, end in zip(timelst, timelst[1:])
    }
    return dataToEntropy(time2issuesEvent)
|
||
|
||
#####################################下面是执行函数#################################
|
||
|
||
def calEntropy(repo, timelst):
    """Compute how a project's "excitation entropy" changes over time.

    With ``timelst=['2016-09-29', '2016-09-30']`` the entropy produced on
    2016-09-29 is computed; with
    ``timelst=['2016-09-29', '2016-09-30', '2016-10-01']`` the entropy of the
    two days 2016-09-29 and 2016-09-30 is computed.

    Args:
        repo: Repository name (str). Not read by this function itself.
        timelst: List of time boundaries passed to ``timelstToEntropy``.

    Returns:
        A tuple ``(x, y)`` of two lists: the time keys and the matching
        entropy values.
    """
    # A later stub, ``def calEntropy(repo, year): pass``, used to rebind this
    # name, so every call returned None. The stub has been dropped so that
    # this implementation is the one that runs.
    result = timelstToEntropy(timelst)
    return (list(result.keys()), list(result.values()))
|
||
def get_date():
    """Build the time axis used for the entropy calculation.

    Collects ``str(date)`` for every date yielded by
    ``Calendar().itermonthdates`` for each month of the years 2015 through
    2053, removes duplicates and sorts the strings.

    Returns:
        A sorted list of unique date strings.
    """
    cal = Calendar()
    uniqueDates = {
        str(day)
        for year in range(2015, 2054)
        for month in range(1, 13)
        for day in cal.itermonthdates(year, month)
    }
    return sorted(uniqueDates)
|
||
|
||
def get_year(year):
    """Return the day boundaries for ``year`` as sorted date strings.

    The list holds every date whose first four characters equal ``str(year)``,
    followed by ``'<year + 1>-01-01'``. If today's date is in that list, the
    list is cut off after today's entry.

    Args:
        year: The calendar year as an int.

    Returns:
        A list of date strings.
    """
    cal = Calendar()
    yearPrefix = str(year)
    inYear = set()
    for month in range(1, 13):
        for day in cal.itermonthdates(year, month):
            text = str(day)
            # itermonthdates also yields dates from neighbouring years;
            # keep only the ones inside the requested year.
            if text[:4] == yearPrefix:
                inYear.add(text)
    dateLst = sorted(inYear)
    dateLst.append(str(year + 1) + '-01-01')

    today = str(datetime.date.today())
    if today in dateLst:
        dateLst = dateLst[:dateLst.index(today) + 1]
    return dateLst
|
||
|
||
def calEntropyYear(repo, year):
    """Return ``{time: entropy}`` for the boundaries produced by ``get_year``.

    Args:
        repo: Repository name. Not read by this function itself.
        year: Year passed to ``get_year``.

    Returns:
        A new dict with the keys and values of the ``timelstToEntropy``
        result, in the same order.
    """
    result = timelstToEntropy(get_year(year))
    return {stamp: entropy for stamp, entropy in result.items()}
|
||
|
||
def get_day(day):
    """Return ``[day, next_day]`` as date strings for a single day.

    The day is first compared, as a string, with today's date. A later day
    prints an error message and yields ``None``.

    NOTE(review): the indentation of the final ``return`` was lost in this
    copy of the file; it is assumed to belong to the ``else`` branch.

    Args:
        day: Date string whose characters ``[0:4]``, ``[5:7]`` and ``[8:10]``
            hold the year, month and day numbers.

    Returns:
        A two-element list of date strings, or ``None`` when ``day`` is after
        today.
    """
    # Reject days that lie after today.
    if str(day) > str(datetime.date.today()):
        print('error:day > today_date')
        return None

    yearNum, monthNum, dayNum = int(day[:4]), int(day[5:7]), int(day[8:10])
    following = datetime.date(yearNum, monthNum, dayNum) + datetime.timedelta(days=1)
    return [str(day), str(following)]
|
||
|
||
def calEntropyDay(repo, day):
    """Return ``{time: entropy}`` for the boundaries produced by ``get_day``.

    Args:
        repo: Repository name. Not read by this function itself.
        day: Date string passed to ``get_day``.

    Returns:
        A new dict with the keys and values of the ``timelstToEntropy``
        result, in the same order.
    """
    result = timelstToEntropy(get_day(day))
    return {stamp: entropy for stamp, entropy in result.items()}
|
||
|
||
|
||
if __name__ == '__main__':
    # Command-line input: the day to evaluate and the repository name.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--day",
        type=str,
        default='2023-01-01',
        help="calculate the date of a day. example: 2023-01-01",
    )
    parser.add_argument(
        "-r", "--repo",
        type=str,
        default='Paddle',
        help="repo name",
    )
    args = parser.parse_args()
    repo, day = args.repo, args.day

    # Calculation. filterEvent() reads the module-level eventLst, so it has
    # to be populated before any entropy function is called.
    eventLst = get_eventLst(repo)

    time2entropy = calEntropyDay(repo, day)
    result = {
        'data': time2entropy,
        'repo': repo,
        'day': day,
    }
    print(result)
|
||
|
||
## timelst = get_date()
|
||
## today_date = datetime.date.today()
|
||
## index = timelst.index(str(today_date))
|
||
## timelst = timelst[:index]
|
||
##
|
||
## #得到time列表['2016-09-29',...],熵值列表[0,...],
|
||
## time,entropy = calEntropy(repo,timelst)
|
||
## time2entropy = dict(zip(time,entropy))
|
||
## np.save(repo+'_time2entropy.npy',time2entropy)
|
||
|