ADD file via upload

This commit is contained in:
sy00000 2023-02-03 10:09:59 +08:00
parent 8507273634
commit 505bcc9221
1 changed files with 197 additions and 0 deletions

197
calEntropyPerDay.py Normal file
View File

@ -0,0 +1,197 @@
import numpy as np
import time
import argparse
import sys
from math import *
from collections import Counter
from calendar import Calendar
import datetime
from elasticsearch import Elasticsearch
# Module-level Elasticsearch client used by get_eventLst().
# NOTE(review): the endpoint is hard-coded and the client is built at import
# time - consider moving the URL into configuration.
es = Elasticsearch(['http://106.75.10.84:9200'])
def get_eventLst(repo, max_pages=1):
    """Fetch issue events of ``repo`` from the ``github_event_raw`` index.

    The index is searched with a ``match`` on ``search_fields.repo`` and only
    the ``data`` field of each hit is requested.

    Args:
        repo: Repository name to match.
        max_pages: Maximum number of 100-hit pages to read. The default of 1
            keeps the previous behaviour, where the paging loop always
            stopped after the first page. Pass ``None`` to keep reading
            until an empty page is returned.
            NOTE(review): ``from``/``size`` paging is normally capped by the
            index's ``index.max_result_window`` setting - confirm the limit
            before requesting every page of a large repository.

    Returns:
        A list of dicts with the keys ``issue_number``, ``event`` and
        ``create_at``, one per hit.

    Raises:
        KeyError: If a hit lacks ``data.issue.number``, ``data.eventType`` or
            ``data.createdAt``.
    """
    page_size = 100
    # Elasticsearch's ``from`` is a zero-based offset; starting at 1 (as the
    # code used to) silently dropped the first matching event.
    offset = 0
    pages_read = 0
    eventLst = []
    while max_pages is None or pages_read < max_pages:
        query = {
            "query": {"bool": {"must": [{"match": {"search_fields.repo": repo}}]}},
            'from': offset,
            'size': page_size,
        }
        value = es.search(index="github_event_raw", body=query, _source=['data'])
        hits = value['hits']['hits']
        if not hits:
            # An empty page means there is nothing left to read.
            break
        for hit in hits:
            data = hit['_source']['data']
            eventLst.append({
                'issue_number': data['issue']['number'],
                'event': data['eventType'],
                'create_at': data['createdAt'],
            })
        offset += page_size
        pages_read += 1
    return eventLst
def filterEvent(startTime,endTime):
    """Count events per issue and event type inside a time window.

    Reads the module-level ``eventLst`` and keeps the events whose
    ``create_at`` value is strictly greater than ``startTime`` and strictly
    less than ``endTime`` (plain ``>`` / ``<`` comparison of the values).

    Returns:
        ``{issue_number: {event: count}}`` for the events in the window.
    """
    issue2eventCount = {}
    for record in eventLst:
        created = record['create_at']
        if not (created > startTime and created < endTime):
            continue
        # One inner counter dict per issue, created on first use.
        per_issue = issue2eventCount.setdefault(record['issue_number'], {})
        kind = record['event']
        per_issue[kind] = per_issue.get(kind, 0) + 1
    return issue2eventCount
def day2timeStamp(string):
    """Convert a date string to an integer timestamp via ``time.mktime``.

    Two layouts are recognised, selected purely by string length:
    10 characters -> ``%Y-%m-%d``, 20 characters -> ``%Y-%m-%dT%H:%M:%SZ``.
    Any other length returns -1.

    NOTE(review): ``time.mktime`` interprets the parsed fields as local time,
    including for the ``...Z`` layout - confirm that this is intended.
    """
    formats_by_length = {
        10: "%Y-%m-%d",
        20: "%Y-%m-%dT%H:%M:%SZ",
    }
    fmt = formats_by_length.get(len(string))
    if fmt is None:
        return -1
    return int(time.mktime(time.strptime(string, fmt)))
# Entropy formula.
def lstToEntropy(lst):
    """Return the base-2 Shannon entropy of the counts in ``lst``, scaled by their sum.

    Each count ``c`` contributes ``-p * log2(p)`` with ``p = c / sum(lst)``;
    the accumulated entropy is then multiplied by ``sum(lst)``.

    Args:
        lst: Sequence of non-negative counts.

    Returns:
        The scaled entropy. 0 is returned when ``lst`` is empty or sums to 0.
    """
    # Hoisted: the total does not change while iterating.
    total = sum(lst)
    if total == 0:
        # Nothing to distribute; also avoids dividing by zero.
        return 0
    result = 0
    for count in lst:
        if count == 0:
            # Usual convention 0 * log(0) == 0; log(0) itself would raise.
            continue
        p = count / total
        result += -p * log(p, 2)
    return result * total
# For the prepared entropy input, compute the entropy of each time window.
# input  time2issuesEvent: {time: {issue_number: {event: count}}}
# output {time: entropy}
def dataToEntropy(time2issuesEvent):
    """Map every time key to the summed per-issue entropy of that window.

    For each issue in a window, ``lstToEntropy`` is applied to the issue's
    event counts with one extra ``1`` appended, and the values are added up.
    Windows with no issues map to 0.

    NOTE(review): the appended ``1`` is presumably a baseline/smoothing term -
    confirm with the author. A per-issue average (dividing by the number of
    issues) exists only as a disabled alternative and is not applied.
    """
    result = {}
    for window, issue_counts in time2issuesEvent.items():
        if not issue_counts:
            result[window] = 0
            continue
        result[window] = sum(
            lstToEntropy(list(event_counts.values()) + [1])
            for event_counts in issue_counts.values()
        )
    return result
# Aggregation: time axis -> per-window event counts -> per-window entropy.
def timelstToEntropy(timelst):
    """Compute the entropy of every window between consecutive entries of ``timelst``.

    Each pair ``(timelst[i], timelst[i + 1])`` is passed to ``filterEvent``;
    the result is stored under the window's start value and the whole mapping
    is handed to ``dataToEntropy``. The last entry only closes the final
    window and therefore never appears as a key.

    Returns:
        ``{window_start: entropy}``.
    """
    time2issuesEvent = {
        start: filterEvent(start, end)
        for start, end in zip(timelst, timelst[1:])
    }
    return dataToEntropy(time2issuesEvent)
##################################### Entry-point functions below #################################
# Compute how a project's excitation entropy changes over time.
# input  repo: str, timelst: list of date strings
# output (x, y): (list, list)
# timelst=['2016-09-29', '2016-09-30'] computes the entropy produced on 2016-09-29
# timelst=['2016-09-29', '2016-09-30', '2016-10-01'] computes the entropy produced
# on each of the two days 2016-09-29 and 2016-09-30
def calEntropy(repo,timelst):
    """Return the window starts and their entropies as two parallel lists.

    A second, empty ``calEntropy(repo, year)`` stub used to follow this
    definition and replaced it at import time, so every call returned None.
    The stub is removed; ``calEntropyYear`` covers the per-year case.

    Args:
        repo: Repository name. Not used here; the events come from the
            module-level event list read by ``filterEvent``.
        timelst: Ordered window boundaries, see the examples above.

    Returns:
        ``(x, y)`` where ``x`` holds the window starts and ``y`` the matching
        entropy values.
    """
    result = timelstToEntropy(timelst)
    return (list(result.keys()), list(result.values()))
## Build the time axis used for the entropy calculation.
def get_date():
    """Return the sorted, de-duplicated date strings for the years 2015-2053.

    The dates come from ``Calendar.itermonthdates``, which yields complete
    weeks, so a few days just before 2015 and just after 2053 are included
    as well.
    """
    cal = Calendar()
    seen = set()
    for year in range(2015, 2054):
        for month in range(1, 13):
            seen.update(str(day) for day in cal.itermonthdates(year, month))
    return sorted(seen)
def get_year(year):
    """Return the window boundaries needed to cover ``year`` day by day.

    The list holds every date string whose first four characters equal
    ``str(year)``, in ascending order, followed by January 1st of the next
    year as the closing boundary. If today's date is in that list, the list
    is cut off right after today.
    """
    cal = Calendar()
    year_prefix = str(year)
    in_year = set()
    for month in range(1, 13):
        for day in cal.itermonthdates(year, month):
            text = str(day)
            # itermonthdates yields whole weeks; drop the neighbouring years.
            if text[:4] == year_prefix:
                in_year.add(text)
    dateLst = sorted(in_year)
    dateLst.append(str(year+1)+'-01-01')
    today = str(datetime.date.today())
    if today in dateLst:
        dateLst = dateLst[:dateLst.index(today) + 1]
    return dateLst
def calEntropyYear(repo,year):
    """Return ``{date: entropy}`` for the daily windows of ``year``.

    The boundaries come from ``get_year`` and the values from
    ``timelstToEntropy``. ``repo`` is accepted but not used here.
    """
    per_day = timelstToEntropy(get_year(year))
    # Hand back a fresh dict with the same key/value pairs.
    return {day: value for day, value in per_day.items()}
def get_day(day):
    """Return the two boundaries ``[day, day + 1]`` of a single-day window.

    Args:
        day: Date string laid out as ``YYYY-MM-DD``; year, month and day are
            read from fixed character positions.

    Returns:
        ``[str(day), next_day]``, or an empty list when ``day`` lies after
        today. The empty list yields no windows downstream. Previously this
        branch printed the message and then produced no usable value, which
        made the caller fail with an unrelated error.

    Raises:
        ValueError: If the sliced year/month/day parts are not a valid date.
    """
    ## Only days up to and including today are accepted.
    today_date = datetime.date.today()
    # ISO-formatted dates order correctly when compared as strings.
    if str(day) > str(today_date):
        print('error:day > today_date')
        return []
    current = datetime.date(int(day[:4]), int(day[5:7]), int(day[8:10]))
    return [str(day), str(current + datetime.timedelta(days=1))]
def calEntropyDay(repo,day):
    """Return ``{day: entropy}`` for the single-day window starting at ``day``.

    The boundaries come from ``get_day`` and the value from
    ``timelstToEntropy``. ``repo`` is accepted but not used here.
    """
    per_day = timelstToEntropy(get_day(day))
    # Hand back a fresh dict with the same key/value pairs.
    return {start: value for start, value in per_day.items()}
def main():
    """Command-line entry point: print the entropy of one day for one repo."""
    # filterEvent() reads the events from this module-level name.
    global eventLst

    ### Input
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-d", "--day",
        default='2023-01-01',
        help="calculate the date of a day. example: 2023-01-01",
        type=str,
    )
    parser.add_argument(
        "-r", "--repo",
        default='Paddle',
        help="repo name",
        type=str,
    )
    args = parser.parse_args()

    ### Computation
    eventLst = get_eventLst(args.repo)
    print({
        'data': calEntropyDay(args.repo, args.day),
        'repo': args.repo,
        'day': args.day,
    })

    # Disabled batch mode, kept for reference: entropy for every day on the
    # full time axis before today, saved to "<repo>_time2entropy.npy".
    ## timelst = get_date()
    ## today_date = datetime.date.today()
    ## index = timelst.index(str(today_date))
    ## timelst = timelst[:index]
    ##
    ## # yields the time list ['2016-09-29',...] and the entropy list [0,...]
    ## time,entropy = calEntropy(repo,timelst)
    ## time2entropy = dict(zip(time,entropy))
    ## np.save(repo+'_time2entropy.npy',time2entropy)


if __name__ == '__main__':
    main()