[Update] Project deployment instructions
This commit is contained in:
parent a6548650cc
commit 35f08e1322

@ -0,0 +1,160 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE    :PyCharm
@Author :LuckyHuibo
@Date   :2019/10/24 18:16
@Desc   :
=================================================='''

import jieba
import numpy as np
import collections
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def split_sentence(text, punctuation_list='!?。!?'):
    """
    Split the text into sentences at the punctuation marks listed in
    punctuation_list and collect all sentences in a list.
    """
    sentence_set = []
    inx_position = 0   # index of the last punctuation mark seen
    char_position = 0  # current character pointer
    for char in text:
        char_position += 1
        if char in punctuation_list:
            next_char = list(text[inx_position:char_position + 1]).pop()
            if next_char not in punctuation_list:
                sentence_set.append(text[inx_position:char_position])
                inx_position = char_position
    if inx_position < len(text):
        sentence_set.append(text[inx_position:])

    sentence_with_index = {i: sent for i, sent in
                           enumerate(sentence_set)}  # dict(zip(sentence_set, range(len(sentences))))
    return sentence_set, sentence_with_index


def get_tfidf_matrix(sentence_set, stop_word):
    corpus = []
    for sent in sentence_set:
        sent_cut = jieba.cut(sent)
        sent_list = [word for word in sent_cut if word not in stop_word]
        sent_str = ' '.join(sent_list)
        corpus.append(sent_str)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # word = vectorizer.get_feature_names()
    tfidf_matrix = tfidf.toarray()
    return np.array(tfidf_matrix)


def get_sentence_with_words_weight(tfidf_matrix):
    sentence_with_words_weight = {}
    for i in range(len(tfidf_matrix)):
        sentence_with_words_weight[i] = np.sum(tfidf_matrix[i])

    max_weight = max(sentence_with_words_weight.values())  # min-max normalization
    min_weight = min(sentence_with_words_weight.values())
    for key in sentence_with_words_weight.keys():
        x = sentence_with_words_weight[key]
        sentence_with_words_weight[key] = (x - min_weight) / (max_weight - min_weight)

    return sentence_with_words_weight


def get_sentence_with_position_weight(sentence_set):
    sentence_with_position_weight = {}
    total_sent = len(sentence_set)
    for i in range(total_sent):
        sentence_with_position_weight[i] = (total_sent - i) / total_sent
    return sentence_with_position_weight


def similarity(sent1, sent2):
    """
    Cosine similarity between two tf-idf vectors; the 1e-6 term guards
    against division by zero for empty sentences.
    """
    return np.sum(sent1 * sent2) / (1e-6 + np.sqrt(np.sum(sent1 * sent1)) *
                                    np.sqrt(np.sum(sent2 * sent2)))


def get_similarity_weight(tfidf_matrix):
    sentence_score = collections.defaultdict(lambda: 0.)
    for i in range(len(tfidf_matrix)):
        score_i = 0.
        for j in range(len(tfidf_matrix)):
            score_i += similarity(tfidf_matrix[i], tfidf_matrix[j])
        sentence_score[i] = score_i

    max_score = max(sentence_score.values())  # min-max normalization
    min_score = min(sentence_score.values())
    for key in sentence_score.keys():
        x = sentence_score[key]
        sentence_score[key] = (x - min_score) / (max_score - min_score)

    return sentence_score


def ranking_base_on_weigth(sentence_with_words_weight,
                           sentence_with_position_weight,
                           sentence_score, feature_weight=[1, 1, 1]):
    sentence_weight = collections.defaultdict(lambda: 0.)
    for sent in sentence_score.keys():
        sentence_weight[sent] = feature_weight[0] * sentence_with_words_weight[sent] + \
                                feature_weight[1] * sentence_with_position_weight[sent] + \
                                feature_weight[2] * sentence_score[sent]

    sort_sent_weight = sorted(sentence_weight.items(), key=lambda d: d[1], reverse=True)
    return sort_sent_weight


def get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3):
    topK = int(len(sort_sent_weight) * topK_ratio)
    print('topK:{0}'.format(topK))
    summarization_sent = sorted([sent[0] for sent in sort_sent_weight[:topK]])

    summarization = []
    for i in summarization_sent:
        summarization.append(sentence_with_index[i])

    summary = ''.join(summarization)
    return summary


if __name__ == '__main__':
    # test_text = '../../data/training17.txt'
    # with open(test_text, 'r', encoding='utf-8') as f:
    #     text = f.read()

    text = '''网易娱乐7月21日报道 林肯公园主唱查斯特·贝宁顿Chester Bennington于今天早上,在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡,年仅41岁。此消息已得到洛杉矶警方证实。
    洛杉矶警方透露,Chester的家人正在外地度假,Chester独自在家,上吊地点是家里的二楼。一说是一名音乐公司工作人员来家里找他时发现了尸体,也有人称是佣人最早发现其死亡。
    林肯公园另一位主唱麦克·信田确认了Chester Bennington自杀属实,并对此感到震惊和心痛,称稍后官方会发布声明。Chester昨天还在推特上转发了一条关于曼哈顿垃圾山的新闻。粉丝们纷纷在该推文下留言,不相信Chester已经走了。
    外媒猜测,Chester选择在7月20日自杀的原因跟他极其要好的朋友、Soundgarden(声音花园)乐队以及Audioslave乐队主唱Chris Cornell有关,因为7月20日是Chris Cornell的诞辰。而Chris Cornell于今年5月17日上吊自杀,享年52岁。Chris去世后,Chester还为他写下悼文。
    对于Chester的自杀,亲友表示震惊但不意外,因为Chester曾经透露过想自杀的念头,他曾表示自己童年时被虐待,导致他医生无法走出阴影,也导致他长期酗酒和嗑药来疗伤。目前,洛杉矶警方仍在调查Chester的死因。
    据悉,Chester与毒品和酒精斗争多年,年幼时期曾被成年男子性侵,导致常有轻生念头。Chester生前有过2段婚姻,育有6个孩子。
    林肯公园在今年五月发行了新专辑《多一丝曙光One More Light》,成为他们第五张登顶Billboard排行榜的专辑。而昨晚刚刚发布新单《Talking To Myself》MV。'''

    stop_word = []

    # This stop-word list contains many additional Chinese-specific entries.
    with open('../../data/stopWordList.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stop_word.append(line.strip())

    sentence_set, sentence_with_index = split_sentence(text, punctuation_list='!?。!?')
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    sentence_with_words_weight = get_sentence_with_words_weight(tfidf_matrix)
    sentence_with_position_weight = get_sentence_with_position_weight(sentence_set)
    sentence_score = get_similarity_weight(tfidf_matrix)
    sort_sent_weight = ranking_base_on_weigth(sentence_with_words_weight,
                                              sentence_with_position_weight,
                                              sentence_score, feature_weight=[1, 1, 1])
    summarization = get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3)
    print('summarization:\n', summarization)

@ -0,0 +1,303 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE    :PyCharm
@Author :LuckyHuibo
@Date   :2019/10/23 18:29
@Desc   :
=================================================='''
from gensim.models import KeyedVectors
import numpy as np
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from pyltp import SentenceSplitter
import pickle

import re
import jieba
import operator
from functools import reduce
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import gc

# NOTE: Myconfig (the project's path/config helper used below) is assumed to be
# importable in this module; its import statement is not part of this snippet.


class SentenceEmbedding:
    # Sentence embedding helper.
    def __init__(self):
        self.word_frequence = self.__get_word_frequence()

    def get_sentences_vec(self, model_wv, sent_list):
        # Build one column vector per sentence: a frequency-weighted average of
        # its word vectors, using the SIF-style weight a / (a + p(w)).
        a = 0.001
        row = model_wv.vector_size
        col = len(sent_list)
        sent_mat = np.zeros((row, col))
        for i, sent in enumerate(sent_list):
            length = len(sent)
            if length == 0: continue
            sent_vec = np.zeros(row)
            for word in sent:
                pw = self.word_frequence.get(word, 0)
                if pw == 0: continue
                w = a / (a + pw)
                try:
                    vec = np.array(model_wv[word])
                    sent_vec += w * vec
                except KeyError:
                    # Words missing from the embedding vocabulary are skipped.
                    pass
            sent_mat[:, i] += sent_vec
            sent_mat[:, i] /= length

        # Remove the projection onto the first singular vector (the common
        # component), as in SIF sentence embeddings.
        u, s, vh = np.linalg.svd(sent_mat, full_matrices=False)
        u1 = u[:, 0:1]
        sent_mat = sent_mat - u1 @ (u1.T @ sent_mat)
        return sent_mat

    def __get_word_frequence(self):
        # No stop-word handling here: words that cannot be found are simply
        # skipped when the sentence vectors are built.
        path = Myconfig.get_path('frequency.txt')
        assert path
        with open(path, 'rb') as f:
            word_frequence = pickle.load(f)
        return word_frequence

    # Cosine similarity between two vectors.
    def cos_similarity(self, v1, v2):
        assert isinstance(v1, np.ndarray)
        assert isinstance(v2, np.ndarray)
        # Vectors of different dimensions are treated as dissimilar.
        if len(v1) != len(v2):
            return 0
        if np.linalg.norm(v2) == 0 or np.linalg.norm(v1) == 0:
            return 0
        return np.vdot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    # Similarity of every other column of the sentence matrix to the first column.
    def __calcu_similarity(self, sent_mat):
        assert (isinstance(sent_mat, np.ndarray) or isinstance(sent_mat, np.matrix))
        first = np.array(sent_mat[:, 0]).flatten()
        col = sent_mat.shape[1]
        sims = []
        for i in range(1, col):
            vec = np.array(sent_mat[:, i]).flatten()
            sims.append(self.cos_similarity(first, vec))
        return sims

    # Similarity of each input sentence to the first entry (the full document).
    def get_similarity_result(self, model_wv, sent_list):
        sent_mat = self.get_sentences_vec(model_wv, sent_list)
        sim = self.__calcu_similarity(sent_mat)
        return sim


# def test(sens, sim):
#     print('##################################')
#     index = list(np.argsort(sim))
#     index.reverse()
#     for i in index:
#         print(sim[i], sens[i])


class Summarization:
    def __init__(self):
        self.position_re_weight = True
        self.Sen_Embedding = SentenceEmbedding()
        self.stopwords = self.__get_stopwords()
        fname = Myconfig.get_path('vec.kv')  # path of the saved word-vector model
        assert fname
        self.model_wv = KeyedVectors.load(fname, mmap='r')

    def __get_stopwords(self):
        path = Myconfig.get_path('stopwords.txt')
        stopwords = []
        with open(path, encoding='GBK') as f:
            line = f.readline()
            while line != '':
                stopwords.append(line.strip('\n'))
                line = f.readline()
        stopwords.append(' ')
        return set(stopwords)

    def __get_keyword(self, string):
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=string, lower=True, window=4)
        keyword_items = tr4w.get_keywords(10, word_min_len=2)
        # Normalize the keyword weights by the largest one.
        keyword_items = sorted(keyword_items, key=lambda x: x.weight)
        over_length = keyword_items[-1].weight
        for wp in keyword_items:
            wp.weight /= over_length
        return keyword_items

    # Sentence splitting with regular expressions.
    def __split_sentence(self, string):
        pattern = re.compile(r'[。,,.??!!""“”]')
        pattern1 = re.compile(r'\w+?([。,,.??!!""“”])')
        flags = pattern1.findall(string)
        sentences = pattern.sub('***', string).split('***')
        sentences = [sen for sen in sentences if sen != '']
        if (len(sentences) > len(flags)): flags.append('.')
        # Drop sentences shorter than 4 characters; these are usually
        # transitional fragments that disturb sentence extraction.
        filter_index = [i for i in range(len(sentences)) if len(sentences[i]) >= 4]
        sentences = [sentences[i] for i in filter_index]
        flags = [flags[i] for i in filter_index]

        return sentences, flags

    # Sentence splitting with the pyltp model.
    def __cut_sentence(self, string):
        """string contains many sentences"""
        sents = SentenceSplitter.split(string)  # split into sentences
        sents = [sen for sen in sents if len(sen) > 4]
        return sents, None

    def __get_tokens(self, sentences):
        sen_tokens = []
        for i, sen in enumerate(sentences):
            sen_tokens.append([])
            words = jieba.cut(sen)
            for wp in words:
                if wp not in self.stopwords:
                    sen_tokens[i].append(wp)
        return sen_tokens

    # Extract the article's topics with LDA.
    # The topics of the article and of the summary could be compared; if their
    # similarity is too low, the individual weights could be re-tuned and the
    # summary extracted again. Comparing single sentences against LDA topics
    # works poorly because they contain too few words.
    def __theme_re_weight(self, tokens):
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(text) for text in tokens]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=20)
        topic = []
        topic.append(lda.show_topic(topicid=0, topn=8))
        topic.append(lda.show_topic(topicid=1, topn=8))
        return topic

    # Smooth the score curve with a small weighted sliding window.
    def __knn_soft(self, sim):
        window = 2
        wight = np.array([0.1, 0.125, 0.5, 0.125, 0.1])
        sim = [sim[0]] * window + sim + [sim[-1]] * window
        sim = np.array(sim)
        sim = [np.dot(sim[i - window:i + window + 1], wight)
               for i in range(window, len(sim) - window)]
        return sim

    # Blend in the influence of the title.
    def __title_re_weight(self, sim, sim_title):
        sim = np.array(sim)
        sim_title = np.array(sim_title)
        p = 0.7
        sim = p * sim + (1 - p) * sim_title
        return list(sim)

    # Boost sentences that contain extracted keywords.
    def __keywords_re_weight(self, keywords, sim, tokens):
        for wp in keywords:
            for i, token in enumerate(tokens):
                if wp.word in token:
                    sim[i] = sim[i] + 0.02 * wp.weight  # add the keyword's weight
        return sim

    # Give extra weight to the first sentence if it is long enough.
    def __startend_re_weight(self, sents, sim):
        if (len(sents[0]) > 20):
            sim[0] = sim[0] + 0.1
        return sim

    def get_summrazation(self, string, num, title=None):
        # sentences, flags = self.__split_sentence(string)
        sentences, flags = self.__cut_sentence(string)
        tokens = self.__get_tokens(sentences)
        tokens_all = reduce(operator.add, tokens)
        new_tokens = [tokens_all] + tokens
        sim = self.Sen_Embedding.get_similarity_result(self.model_wv, new_tokens)
        # test(sentences, sim)  # testpoint
        assert len(sim) == len(tokens)
        keywords = self.__get_keyword(string)
        # Update the weights once according to the keywords.
        sim = self.__keywords_re_weight(keywords, sim, tokens)
        # test(sentences, sim)  # testpoint
        # If there is a title, update the weights once according to it.
        if title:
            title_tokens = self.__get_tokens([title])
            new_tokens = title_tokens + tokens
            sim_title = self.Sen_Embedding.get_similarity_result(self.model_wv, new_tokens)
            sim = self.__title_re_weight(sim, sim_title)

        # Update the weights once according to sentence position.
        if self.position_re_weight:
            sim = self.__startend_re_weight(sentences, sim)
        # test(sentences, sim)  # testpoint

        sim = self.__knn_soft(sim)  # sliding-window smoothing
        # test(sentences, sim)  # testpoint

        assert len(sim) == len(tokens)
        index = list(np.argsort(sim))
        index = index[-num:]  # keep the num highest-scoring sentences
        index.sort()          # restore document order

        # Re-attach the punctuation as well.
        abstract = []
        if flags:
            for i in index:
                abstract.append(sentences[i])
                abstract.append(flags[i])
        else:
            abstract = [sentences[i] for i in index]

        topic = self.__theme_re_weight(tokens)

        keywords = [(wp.word, wp.weight) for wp in keywords]
        # for wp in keywords:
        #     result['keywords'].append({'cat': 'a', 'name': wp.word, 'value': 30, 'pro': wp.weight})

        return ''.join(abstract), keywords, topic


def data_format(abstract, keywords, topic):
    # Shape the abstract, keywords and topics for the front end.
    keywords = sorted(keywords, key=lambda x: x[1])
    length_range = keywords[-1][1]
    result = {}
    result['keywords'] = []
    for i, wp in enumerate(keywords):
        result['keywords'].append({'cat': i,
                                   'name': wp[0],
                                   'value': round(10 + 50 * wp[1] / length_range, 2),
                                   'pro': round(float(wp[1]), 4)})
    result['summarization'] = abstract
    topic_new = []
    for tp in topic:
        temp = []
        for wp in tp:
            temp.append({"name": wp[0], 'value': round(float(wp[1]), 4)})
        topic_new.append(temp)

    result['topics'] = topic_new
    return result


class My_Summrazation:
    # Facade class that exposes everything in this module through one interface.
    def __init__(self):
        self.Summ = Summarization()

    def get_results(self, text, num, title=None):
        # try:
        return data_format(*self.Summ.get_summrazation(text, num, title))
        # except:
        #     return None

    def release(self):
        del self.Summ.model_wv
        gc.collect()


if __name__ == "__main__":
    pass

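A usage sketch for the facade class above, not part of the committed file: it assumes Myconfig can resolve 'vec.kv', 'frequency.txt' and 'stopwords.txt' to real files, and the title string is only illustrative.

```python
# Hypothetical usage of My_Summrazation; `article` stands for any Chinese news text.
article = '林肯公园主唱查斯特·贝宁顿Chester Bennington于今天早上,在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡,年仅41岁。此消息已得到洛杉矶警方证实。'
summ = My_Summrazation()
result = summ.get_results(article, num=3, title='林肯公园主唱自缢身亡')
print(result['summarization'])   # the extracted abstract
print(result['keywords'])        # [{'cat': ..., 'name': ..., 'value': ..., 'pro': ...}, ...]
summ.release()                   # free the loaded word vectors
```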
@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE    :PyCharm
@Author :Valuebai
@Date   :2019/11/15 17:27
@Desc   :
=================================================='''
import pkuseg


'''
Title
If the text has a title, the title can help us a lot.
So far, the similarity of each sentence to the document as a whole has been computed
by comparing every sub-sentence against the embedding of the full text.
With a title available, we can also take the title's embedding,
and the score of each sentence becomes a "blend" of its similarity to the whole
document and its similarity to the title.
'''

# Pseudocode sketch; get_sentence_vec, cosine and sub_sen_n are placeholders.
title = 'some words'
content = 'more and more words'
sentence_vec_title = get_sentence_vec(title)
sentence_vec_content = get_sentence_vec(content)
# For a sub-sentence sub_sen_n, the similarity used to be
#   cosine(get_sentence_vec(sub_sen_n), sentence_vec_content)
# Now it can be:
p = 0.5
sen_vec = get_sentence_vec(sub_sen_n)
similarity = p * cosine(sen_vec, sentence_vec_title) + (1 - p) * cosine(sen_vec, sentence_vec_content)
# Of course, p and the way it is combined with the cosine terms can vary;
# p and 1 - p form a linear blend here, but any other combination rule would work too.

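A self-contained sketch of the blending idea above, with toy NumPy vectors standing in for real sentence embeddings (all names and numbers are illustrative only):

```python
import numpy as np

def cosine(v1, v2):
    # Cosine similarity with a small epsilon so zero vectors do not divide by zero.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2) + 1e-6))

# Toy stand-ins for embeddings of the title, the full document and one sub-sentence.
vec_title = np.array([0.2, 0.8, 0.1])
vec_content = np.array([0.3, 0.6, 0.4])
vec_sub_sentence = np.array([0.25, 0.7, 0.2])

p = 0.5  # blending factor between title similarity and whole-document similarity
score = p * cosine(vec_sub_sentence, vec_title) + (1 - p) * cosine(vec_sub_sentence, vec_content)
print(round(score, 4))
```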
Binary file not shown.
276  README.md

@ -1,33 +1,40 @@
# Text-Auto-Summarization 文本自动摘要







<p align="center">
<!-- quick in-page navigation -->
<a href="#quick-start">Quick Start</a>
<a href="# projects">Projects</a> :•
<a href="## Textrank 和 Pagerank">Textrank and Pagerank</a> •
<a href="## 核心算法详解(采用Extraction)">Core algorithm explained (Extraction)</a> •
<a href="#deploy">Deploy</a> :•
<a href="## Ptyhon创建虚拟环境">Creating a Python virtual environment</a> •
<a href="## Requirements">Requirements</a>•
<a href="## linux部署指南">Linux deployment guide</a>•
<a href="## linux上杀死gunicorn的进程">Killing gunicorn processes on Linux</a>•
<a href="## linux根据端口号查找项目路径方法">Finding a project path from a port number on Linux</a>•
<a href="# 前端页面">Front-end pages</a>
<!--a href="http://developers.tron.network">Documentation</a-->
<!--a href="#resource">Resource</a-->
</p>

## Project showcase
http://111.229.74.215:8188/TextSummarization/

# quick-start



## Project deployment

### Standard steps
1. Clone the project: git clone https://github.com/Valuebai/Text-Auto-Summarization.git
2. Install Python; this project uses Python 3.6.5.
3. Install the dependencies: pip install -r requirements.txt
4. Run the project to check that it works: python run.py
5. Once it runs correctly, the command below keeps it running in the background on Linux (the expected content of run.sh is shown right after this block):
```
sh run.sh    # or: . run.sh
```
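run.sh itself is expected to contain only the one-line nohup wrapper described in the Linux deployment guide further down (a sketch; adjust the python binary to your environment):

```
nohup python3 -u run.py > nohup.log 2>&1 &
```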

### Creating a separate virtual environment for the project (optional)
[Creating Python virtual environments on Windows/Linux](https://blog.csdn.net/luhuibo318/article/details/94011917)


### Deployment guide for a newly purchased Linux CentOS 7 cloud server
1. [【Linux】CentOS common commands & must-read setup for new cloud servers](https://github.com/Valuebai/awesome-python-io/issues/1)
2. [Five ways to deploy a python+flask project on Linux](https://blog.csdn.net/luhuibo318/article/details/102688154)



【Core techniques】still to be merged in: https://github.com/ZhiWenMo/Autosummarization_self_dis/blob/master/Autosummarization.ipynb

## Current summarization techniques fall into
1. Extraction (extractive)

@ -236,231 +243,6 @@ jieba.initialize()



# deploy

## Creating a Python virtual environment

### Method 1: the built-in venv module
1. Change into the project folder.
2. python -m venv -h shows the help.
3. Typical commands:
```
On Linux:
$ Create a default environment:   python3 -m venv my_venv
$ Create with a specific version: python3.6 -m venv my_venv, python2 -m venv my_venv (the interpreter must be on PATH)
$ Activate the environment:       . my_venv/bin/activate   (either . or source)
$ Leave the environment:          deactivate

On Windows, run cmd and use the "py" launcher with the "-m" switch:
$ Create an environment:          py -3 -m venv my_venv   (or python -m venv my_venv)
$ Create with a specific version: py -3.6 -m venv my_venv, py -3.7 -m venv my_venv (the interpreter must be on PATH)
$ Activate the environment:       my_venv\Scripts\activate.bat
$ Leave the environment:          deactivate

Once activated, the prompt is prefixed with <my_venv>, showing that the virtual environment is active.

Install the project:
$ pip install -r requirements.txt
```

### Method 2: create the virtual environment from PyCharm on Windows
1. Install and activate PyCharm
Please install it yourself.
Official site: https://www.jetbrains.com/pycharm/

2. Create the virtual environment in PyCharm
Step 1: click New Project.
Step 2: choose New environment (see the screenshot).
Step 3: click Create.
PyCharm automatically creates a virtual environment for the new project.


### Method 3: create the virtual environment with conda

[Common anaconda operations](https://blog.csdn.net/CampusAmour/article/details/83215524)

Starting its terminal on Linux:
$ source ~/anaconda3/bin/activate root
$ anaconda-navigator

- Create a virtual environment: conda create -n env_name python=3.6

- Create it and install required packages in one go: conda create -n env_name numpy matplotlib python=3.6

- Activate the virtual environment:
  - Linux: source activate your_env_name
  - Windows: activate your_env_name

- Leave the virtual environment:
  - Linux: source deactivate your_env_name
  - Windows: deactivate your_env_name

- Delete a virtual environment: conda remove -n your_env_name --all
- Remove a package: conda remove --name $your_env_name $package_name


Common conda commands
- List installed packages: conda list
- Install a package: conda install package_name
- List existing virtual environments: conda env list  or  conda info -e
- Check for and apply conda updates: conda update conda


## Requirements
- How to generate the file:
- Step 1: install the tool: pip install pipreqs
- Step 2: run it from the project root to generate requirements.txt: pipreqs ./ --encoding=utf8 --force (the encoding flag avoids errors with Chinese paths)
- Step 3: after cloning this repo, simply run pip install -r requirements.txt
- Or install inside a virtual environment (see the commands right after this list)
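The same three steps as plain commands, run from the repository root:

```
pip install pipreqs
pipreqs ./ --encoding=utf8 --force
pip install -r requirements.txt
```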

@[TOC](Table of contents) # auto-generates a TOC on CSDN




## Linux deployment guide
### 1. Running the python script in the background with sh & nohup
- 1) Create the script: vim run.sh
- 2) Put this line in it and save: nohup python3 -u run.py > nohup.log 2>&1 &
- 3) Run it: sh run.sh  or  . run.sh
- References: [Differences between sh, source and . when executing .sh files](https://www.zengdongwu.com/article3.html) and
[Running commands in the background on Linux: & and nohup](https://blog.csdn.net/liuyanfeier/article/details/62422742)
```md
- nohup : "no hang up" - the process keeps running after you log out of the account
- With & alone the job runs in the background and the console stays free, but the job stops as soon as the console is closed (you log out); nohup keeps it running after logout
- python3 -u run.py : runs the script
  - -u means unbuffered: stdout/stderr are written immediately, so output from code that sleeps or runs in the background still shows up in the log in real time
- > nohup.log : redirects the log into nohup.log in the current directory
- 2>&1 : sends standard error to nohup.log as well
- & : the trailing &, runs the whole command in the background
```


### 2. Deploying the flask service with gunicorn (recommended for personal projects)
- 1) Create the script: vim gunicorn.sh
- 2) Put the following in it and save (a combined sketch follows this section):
    - conda activate just_do_it   (optional, if you created your own environment on the server)
    - nohup gunicorn -w 4 -b 0.0.0.0:8001 run:app &   (without a log)
    - nohup gunicorn -w 4 -b 0.0.0.0:8001 run:app > gunicorn.log 2>&1 &   (with a log)

- 3) Run it: sh gunicorn.sh  or  . gunicorn.sh

```md
gunicorn must be installed first: pip install gunicorn
In the simplest case, gunicorn -w 4 -b 0.0.0.0:8001 run:app starts a Flask application, where

-w 4 sets the number of worker processes to 4,
-b 0.0.0.0:8001 sets the bind address and port,
run is the python module that starts flask, and app is the Flask application instance.

run.py may look like this:
# run.py
from flask import Flask
app = Flask(__name__)   # gunicorn only needs this module-level app object

References:
Deploying a Flask service with gunicorn: https://www.jianshu.com/p/fecf15ad0c9a
https://www.cnblogs.com/gaidy/p/9784919.html
```
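A possible gunicorn.sh combining the lines above (the conda environment name and port are just the example values used in this README):

```
#!/bin/bash
# conda activate just_do_it        # optional: activate your own environment first
nohup gunicorn -w 4 -b 0.0.0.0:8001 run:app > gunicorn.log 2>&1 &
```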

### 3. Deploying with the screen command
- Step 1: screen -S yourname creates a session called yourname.
- Step 2: python run.py runs the code; it keeps running on the server even after the shell connection is closed (a complete session example follows this block).
- Suitable for quick deployments with few users (the approach used for this deployment).
- More about screen: https://www.cnblogs.com/mchina/archive/2013/01/30/2880680.html
```
Kill all matching processes: ps aux|grep your_process_name|grep -v grep | awk '{print $2}'|xargs kill -9

https://www.hutuseng.com/article/how-to-kill-all-detached-screen-session-in-linux
```
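A typical screen session for this project (the session name is arbitrary; detaching and re-attaching use the standard screen key bindings):

```
screen -S summarization     # create a named session
python run.py               # start the service inside it
# press Ctrl+A then D to detach; screen -r summarization re-attaches later
```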

### 4. flask + nginx + uwsgi (not recommended: Flask combined with uWSGI has many hard-to-handle bugs)
- For sites with heavy traffic; see the articles below for details.
- https://blog.csdn.net/spark_csdn/article/details/80790929
- https://www.cnblogs.com/Ray-liang/p/4173923.html
- https://blog.csdn.net/daniel_ustc/article/details/9070357

### 5. flask + nginx + gunicorn (recommended for larger projects)
- Many large companies use this setup in production, hence the recommendation.
- Also recommended because Flask combined with uWSGI has many hard-to-handle bugs.
- [Flask + Gunicorn + Nginx deployment](https://www.cnblogs.com/Ray-liang/p/4837850.html)



## Killing gunicorn processes on Linux
**Method 1**
1. netstat -nltp | grep 8188
   You will see something like:
   tcp 0 0 0.0.0.0:8188 0.0.0.0:* LISTEN 23422/gunicorn: mas

2. kill -9 23422  (replace 23422 with your own PID)


**Method 2**
1. Get the gunicorn process tree
```
pstree -ap|grep gunicorn

which prints something like:

Python
| | |-grep,14519 --color=auto gunicorn
| -gunicorn,28097 /usr/local/bin/gunicorn query_site.wsgi:application -c ...
| |-gunicorn,14226 /usr/local/bin/gunicorn query_site.wsgi:application -c ...
| | |-{gunicorn},14229
| | |-{gunicorn},14230
...

```

2. Restart the gunicorn workers

kill -HUP 14226

3. Stop gunicorn completely

kill -9 28097



## Finding a project path from a port number on Linux
### 1. When you only know the port number
#### Method 1

**1. Find the process by port number, e.g. 6379**

```
netstat -lnp|grep 6379
```

**2. Use the process id to look up the program path**
```
ll /proc/2757
```
This reveals the program's path.

#### Method 2
**1. First find the process by port**
```
netstat -nltp
or
netstat -nltp | grep python
or
netstat -apn |grep 10010
```
**2. Then look up the project path from the process id**
```
ps -ef |grep 8567
```
**3. If step 2 does not reveal the project path, try**
```
lsof -p 8567
```
### 2. If you know the project is deployed inside tomcat
If the project is deployed in a tomcat container on Linux, the command below will find it:
```
ps anx|grep tomcat
```




# Front-end pages

## Building the front-end pages quickly with Flask

Binary file not shown.
File diff suppressed because one or more lines are too long
2  run.py
@ -47,3 +47,5 @@ if __name__ == "__main__":
logger.info('is_dev_mode:{}'.format(is_dev_mode))
# main run
app.run(host='0.0.0.0', port=8188)

Binary file not shown.
After Width: | Height: | Size: 81 KiB |
Binary file not shown.
After Width: | Height: | Size: 110 KiB |
@ -22,8 +22,11 @@
<body>
<H1 align="center">Welcome to my Home</H1>
<div style="text-align: center;">
    <img src="https://user-images.githubusercontent.com/9695113/58942751-f4a5a100-87b0-11e9-9116-915f85c5f65a.jpg"
         alt="上海鲜花港 - 郁金香"/>
    <img src="./static/images/home.jpg"
         alt="god bless you"
         width="600px"
         height="400px"
    />
</div>
<div class="absoluteCenter">
<H3 align="center">耶和华是我的牧者,我必不致缺乏。</H3>