calEntropy/calEntropyPerDay.py

198 lines
6.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numpy as np
import time
import argparse
import sys
from math import *
from collections import Counter
from calendar import Calendar
import datetime
from elasticsearch import Elasticsearch
# Module-level Elasticsearch client used by get_eventLst. The endpoint is
# hard-coded and the client object is created as soon as the module is imported.
es = Elasticsearch(['http://106.75.10.84:9200'])
def get_eventLst(repo):
    """Return every event recorded for ``repo`` in the ``github_event_raw`` index.

    Hits are streamed with the scroll-based ``helpers.scan`` instead of
    ``from``/``size`` paging. ``from`` is a zero-based offset (starting it at 1
    drops the first hit), and offset paging is rejected once ``from + size``
    exceeds the index's ``max_result_window`` (10,000 by default), so it cannot
    reliably return all events of a large repository.

    Args:
        repo: Repository name matched against the ``search_fields.repo`` field.

    Returns:
        A list of dicts, one per hit, with the keys ``issue_number``,
        ``event`` and ``create_at``. The order of the list is unspecified.

    Raises:
        KeyError: If a hit lacks ``data.issue.number``, ``data.eventType`` or
            ``data.createdAt``.
    """
    # Local import: ``helpers`` ships with the elasticsearch package that the
    # module already depends on.
    from elasticsearch import helpers

    query = {"query": {"bool": {"must": [{"match": {"search_fields.repo": repo}}]}}}
    hits = helpers.scan(
        es,
        query=query,
        index="github_event_raw",
        _source=['data'],
        size=100,  # documents fetched per scroll round-trip
    )

    eventLst = []
    for hit in hits:
        data = hit['_source']['data']
        eventLst.append({
            'issue_number': data['issue']['number'],
            'event': data['eventType'],
            'create_at': data['createdAt'],
        })
    return eventLst
def filterEvent(startTime, endTime, events=None):
    """Count events per issue and per event type inside an open time window.

    Args:
        startTime: Exclusive lower bound; an event is kept only when its
            ``create_at`` compares strictly greater than this value.
        endTime: Exclusive upper bound; an event is kept only when its
            ``create_at`` compares strictly less than this value.
        events: Iterable of dicts with the keys ``issue_number``, ``event``
            and ``create_at``. When omitted, the module-level ``eventLst``
            is used.

    Returns:
        A nested dict ``{issue_number: {event: count}}`` covering only the
        events that fall inside the window.
    """
    if events is None:
        # Fall back to the module-level list filled in by the script entry point.
        events = eventLst

    issue2eventCount = {}
    for event in events:
        if startTime < event['create_at'] < endTime:
            per_issue = issue2eventCount.setdefault(event['issue_number'], {})
            per_issue[event['event']] = per_issue.get(event['event'], 0) + 1
    return issue2eventCount
def day2timeStamp(string):
    """Convert a date or date-time string to an integer timestamp.

    The format is selected by the length of ``string``: 10 characters are
    parsed as ``%Y-%m-%d`` and 20 characters as ``%Y-%m-%dT%H:%M:%SZ``. The
    parsed value is passed to ``time.mktime``, which interprets it as local
    time.

    Returns:
        The timestamp as an ``int``, or ``-1`` when the length matches
        neither supported format.
    """
    formats_by_length = {
        10: "%Y-%m-%d",
        20: "%Y-%m-%dT%H:%M:%SZ",
    }
    fmt = formats_by_length.get(len(string))
    if fmt is None:
        return -1
    parsed = time.strptime(string, fmt)
    return int(time.mktime(parsed))
# Entropy formula.
def lstToEntropy(lst):
    """Return the base-2 Shannon entropy of the counts in ``lst`` times their sum.

    Each count is turned into a probability ``p = count / sum(lst)``; the
    result is ``sum(-p * log2(p)) * sum(lst)``.

    Args:
        lst: Sequence of non-negative counts.

    Returns:
        The scaled entropy. ``0`` is returned when the counts sum to zero
        (including an empty ``lst``).
    """
    total = sum(lst)  # computed once instead of once per element
    if not total:
        # Nothing was observed, so there is no distribution to measure.
        return 0

    result = 0
    for count in lst:
        if not count:
            # A zero count contributes nothing (p * log p -> 0 as p -> 0);
            # skipping it also avoids the domain error from log(0).
            continue
        p = count / total
        result += -p * log(p, 2)
    return result * total
# Turn the prepared per-window counts into one entropy value per window.
# input  time2issuesEvent: {time: {issue_number: {event: count}}}
# output {time: entropy}
def dataToEntropy(time2issuesEvent):
    """Sum the per-issue entropies of every time window.

    For each issue in a window, the event counts plus one extra count of ``1``
    are passed to ``lstToEntropy`` (presumably a baseline pseudo-count --
    confirm with the author). The per-issue values are summed, not averaged.

    Args:
        time2issuesEvent: ``{time: {issue_number: {event: count}}}``.

    Returns:
        ``{time: entropy}``; a window without any issue maps to ``0``.
    """
    entropy_by_time = {}
    for window, per_issue in time2issuesEvent.items():
        if not per_issue:
            entropy_by_time[window] = 0
            continue
        entropy_by_time[window] = sum(
            lstToEntropy(list(event_counts.values()) + [1])
            for event_counts in per_issue.values()
        )
    return entropy_by_time
# Aggregate: bucket the events between consecutive boundaries, then score each bucket.
def timelstToEntropy(timelst):
    """Compute the entropy of every interval between consecutive entries of ``timelst``.

    Args:
        timelst: Ordered window boundaries. N entries describe N-1 windows,
            and each window is keyed by its starting boundary.

    Returns:
        The ``{start: entropy}`` dict produced by ``dataToEntropy``.
    """
    windows = {
        start: filterEvent(start, end)
        for start, end in zip(timelst, timelst[1:])
    }
    return dataToEntropy(windows)
############################ entry-point functions below ############################
# Compute how a project's entropy changes over time.
def calEntropy(repo, timelst):
    """Return the window starts and their entropies as two parallel lists.

    ``timelst=['2016-09-29', '2016-09-30']`` yields the entropy produced on
    2016-09-29; ``timelst=['2016-09-29', '2016-09-30', '2016-10-01']`` yields
    the entropies of 2016-09-29 and 2016-09-30.

    Args:
        repo: Repository name. It is not read by this function; the events
            are whatever ``timelstToEntropy`` operates on.
        timelst: Ordered list of window boundaries.

    Returns:
        A tuple ``(x, y)`` where ``x`` lists the window starts and ``y`` the
        matching entropy values.
    """
    result = timelstToEntropy(timelst)
    x = list(result.keys())
    y = list(result.values())
    return (x, y)
# Build the time axis used for the entropy calculation.
def get_date(start_year=2015, end_year=2054):
    """Return the sorted, de-duplicated ISO dates covering a range of years.

    ``Calendar.itermonthdates`` yields whole weeks, so the result also
    contains the few days before ``start_year`` and after ``end_year - 1``
    that complete the first and last week.

    Args:
        start_year: First year to include.
        end_year: Year at which to stop (exclusive, as in ``range``).

    Returns:
        A sorted list of ``YYYY-MM-DD`` strings without duplicates.
    """
    c = Calendar()
    unique_days = {
        str(day)
        for year in range(start_year, end_year)
        for month in range(1, 13)
        for day in c.itermonthdates(year, month)
    }
    return sorted(unique_days)
def get_year(year):
    """Return the daily window boundaries for ``year``.

    The list holds every day of ``year`` in ascending order, followed by
    January 1st of the next year so the last day has a closing boundary. When
    today's date is in that list, the list is cut so that today is its last
    entry.

    Args:
        year: Calendar year as an ``int``.

    Returns:
        A sorted list of ``YYYY-MM-DD`` strings.
    """
    c = Calendar()
    # itermonthdates pads each month to whole weeks; keep only days that
    # really belong to the requested year (the set removes the overlap).
    dateLst = sorted({
        str(d)
        for month in range(1, 13)
        for d in c.itermonthdates(year, month)
        if d.year == year
    })
    dateLst.append(str(year + 1) + '-01-01')

    today = str(datetime.date.today())
    if today in dateLst:
        dateLst = dateLst[:dateLst.index(today) + 1]
    return dateLst
def calEntropyYear(repo, year):
    """Return a ``{day: entropy}`` dict for the daily windows of ``year``.

    Args:
        repo: Repository name; not read by this function.
        year: Calendar year passed to ``get_year`` to build the boundaries.

    Returns:
        A new dict holding the items produced by ``timelstToEntropy``.
    """
    boundaries = get_year(year)
    per_day = timelstToEntropy(boundaries)
    return dict(per_day.items())
def get_day(day):
    """Return the boundaries ``[day, day + 1]`` of a single-day window.

    Args:
        day: Date string whose characters 0-3, 5-6 and 8-9 hold year, month
            and day (for example ``2023-01-01``).

    Returns:
        ``[day, next_day]`` as strings. When ``day`` sorts after today's
        date, an error line is printed and an empty list is returned, so
        callers that iterate over the boundaries produce an empty result
        instead of failing.
    """
    # Dates are compared as strings, which relies on the zero-padded
    # YYYY-MM-DD layout.
    today_date = datetime.date.today()
    if str(day) > str(today_date):
        print('error:day > today_date')
        return []

    start = datetime.date(int(day[:4]), int(day[5:7]), int(day[8:10]))
    next_day = start + datetime.timedelta(days=1)
    return [str(day), str(next_day)]
def calEntropyDay(repo, day):
    """Return a ``{day: entropy}`` dict for the single-day window starting at ``day``.

    Args:
        repo: Repository name; not read by this function.
        day: Date string passed to ``get_day`` to build the boundaries.

    Returns:
        A new dict holding the items produced by ``timelstToEntropy``.
    """
    boundaries = get_day(day)
    per_day = timelstToEntropy(boundaries)
    return dict(per_day.items())
if __name__ == '__main__':
    # --- input ---
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--day",
        default='2023-01-01',
        help="calculate the date of a day. example: 2023-01-01",
        type=str,
    )
    parser.add_argument(
        "-r", "--repo",
        default='Paddle',
        help="repo name",
        type=str,
    )
    args = parser.parse_args()
    repo, day = args.repo, args.day

    # --- compute ---
    # Assigned at module level on purpose: filterEvent reads eventLst as a global.
    eventLst = get_eventLst(repo)
    time2entropy = calEntropyDay(repo, day)
    result = {
        'data': time2entropy,
        'repo': repo,
        'day': day,
    }
    print(result)

    # Disabled batch mode, kept for reference: compute the entropy of every
    # day up to (but excluding) today and save it as a .npy file.
    ## timelst = get_date()
    ## today_date = datetime.date.today()
    ## index = timelst.index(str(today_date))
    ## timelst = timelst[:index]
    ##
    ## # time is a list like ['2016-09-29', ...], entropy a list like [0, ...]
    ## time,entropy = calEntropy(repo,timelst)
    ## time2entropy = dict(zip(time,entropy))
    ## np.save(repo+'_time2entropy.npy',time2entropy)