Text-Auto-Summarization/APP/TextSummarization/refer_multi_feature_sum.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''=================================================
@IDE     PyCharm
@Author  LuckyHuibo
@Date    2019/10/24 18:16
@Desc    Multi-feature extractive summarization: ranks sentences by a
         weighted combination of TF-IDF word weight, sentence position
         weight, and inter-sentence similarity, then keeps the top share.
=================================================='''
import jieba
import numpy as np
import collections
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def split_sentence(text, punctuation_list='!?。!?'):
    """
    Split the text into sentences at the punctuation marks in punctuation_list
    and collect all sentences in a list.
    """
    sentence_set = []
    inx_position = 0   # start index of the current sentence
    char_position = 0  # moving character pointer
    for char in text:
        char_position += 1
        if char in punctuation_list:
            # Only split when the next character is not also punctuation,
            # so runs such as '!?' stay attached to one sentence.
            next_char = text[char_position] if char_position < len(text) else ''
            if next_char not in punctuation_list:
                sentence_set.append(text[inx_position:char_position])
                inx_position = char_position
    if inx_position < len(text):  # keep any trailing text without end punctuation
        sentence_set.append(text[inx_position:])

    sentence_with_index = {i: sent for i, sent in enumerate(sentence_set)}
    return sentence_set, sentence_with_index
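
# Example (hypothetical input): split_sentence('天气好。下雨了!?')
# returns (['天气好。', '下雨了!?'], {0: '天气好。', 1: '下雨了!?'}).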


def get_tfidf_matrix(sentence_set, stop_word):
    """Tokenize each sentence with jieba and build a sentence-level TF-IDF matrix."""
    corpus = []
    for sent in sentence_set:
        sent_cut = jieba.cut(sent)
        sent_list = [word for word in sent_cut if word not in stop_word]
        corpus.append(' '.join(sent_list))

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    return tfidf.toarray()
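
# Note: CountVectorizer followed by TfidfTransformer is equivalent to sklearn's
# TfidfVectorizer. The dense result has shape (n_sentences, vocabulary_size),
# one TF-IDF row vector per sentence.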


def get_sentence_with_words_weight(tfidf_matrix):
    """Score each sentence by the sum of its TF-IDF weights, min-max normalized."""
    sentence_with_words_weight = {}
    for i in range(len(tfidf_matrix)):
        sentence_with_words_weight[i] = np.sum(tfidf_matrix[i])

    max_weight = max(sentence_with_words_weight.values())
    min_weight = min(sentence_with_words_weight.values())
    denom = (max_weight - min_weight) or 1.0  # guard against all-equal weights
    for key in sentence_with_words_weight.keys():
        x = sentence_with_words_weight[key]
        sentence_with_words_weight[key] = (x - min_weight) / denom
    return sentence_with_words_weight
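
# Feature 1 (word weight): weight_i = (s_i - min_j s_j) / (max_j s_j - min_j s_j)
# with s_i = sum of row i of the TF-IDF matrix, so scores land in [0, 1].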


def get_sentence_with_position_weight(sentence_set):
    """Score each sentence by its position: earlier sentences weigh more."""
    sentence_with_position_weight = {}
    total_sent = len(sentence_set)
    for i in range(total_sent):
        sentence_with_position_weight[i] = (total_sent - i) / total_sent
    return sentence_with_position_weight
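
# Feature 2 (position weight): a linear decay, e.g. 5 sentences score
# 1.0, 0.8, 0.6, 0.4, 0.2. Leading sentences tend to carry the key facts
# in news text, which this feature rewards.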


def similarity(sent1, sent2):
    """
    Compute the cosine similarity of two TF-IDF vectors.
    """
    return np.sum(sent1 * sent2) / (1e-6 + np.sqrt(np.sum(sent1 * sent1)) *
                                    np.sqrt(np.sum(sent2 * sent2)))
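
# The 1e-6 term keeps the denominator non-zero for all-zero vectors (a sentence
# whose every token was filtered out as a stop word); identical non-zero
# vectors score just under 1.0.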


def get_similarity_weight(tfidf_matrix):
    """Score each sentence by its total similarity to all sentences, min-max normalized."""
    sentence_score = collections.defaultdict(lambda: 0.)
    for i in range(len(tfidf_matrix)):
        score_i = 0.
        for j in range(len(tfidf_matrix)):
            score_i += similarity(tfidf_matrix[i], tfidf_matrix[j])
        sentence_score[i] = score_i

    max_score = max(sentence_score.values())
    min_score = min(sentence_score.values())
    denom = (max_score - min_score) or 1.0  # guard against all-equal scores
    for key in sentence_score.keys():
        x = sentence_score[key]
        sentence_score[key] = (x - min_score) / denom
    return sentence_score
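
# Feature 3 (similarity weight): sentences similar to many others are treated
# as more central. The j == i self-similarity term adds roughly the same
# constant to every score, so the min-max normalization cancels it out.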


def ranking_base_on_weight(sentence_with_words_weight,
                           sentence_with_position_weight,
                           sentence_score, feature_weight=(1, 1, 1)):
    """Combine the three features into one score and sort sentences by it, descending."""
    sentence_weight = collections.defaultdict(lambda: 0.)
    for sent in sentence_score.keys():
        sentence_weight[sent] = feature_weight[0] * sentence_with_words_weight[sent] + \
                                feature_weight[1] * sentence_with_position_weight[sent] + \
                                feature_weight[2] * sentence_score[sent]
    sort_sent_weight = sorted(sentence_weight.items(), key=lambda d: d[1], reverse=True)
    return sort_sent_weight
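
# Combined score: w_i = a * words_i + b * position_i + c * similarity_i with
# (a, b, c) = feature_weight. With the default (1, 1, 1) each feature counts
# equally, since all three are scaled to [0, 1].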


def get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3):
    """Keep the top topK_ratio share of ranked sentences, re-joined in document order."""
    topK = int(len(sort_sent_weight) * topK_ratio)
    print('topK:{0}'.format(topK))
    summarization_sent = sorted([sent[0] for sent in sort_sent_weight[:topK]])
    summarization = [sentence_with_index[i] for i in summarization_sent]
    summary = ''.join(summarization)
    return summary
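

# A minimal convenience wrapper, not part of the original pipeline: the name
# summarize and its defaults are assumptions added for illustration. It simply
# chains the helpers above into a single call.
def summarize(text, stop_word=(), feature_weight=(1, 1, 1), topK_ratio=0.3):
    sentence_set, sentence_with_index = split_sentence(text)
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    words_weight = get_sentence_with_words_weight(tfidf_matrix)
    position_weight = get_sentence_with_position_weight(sentence_set)
    similarity_weight = get_similarity_weight(tfidf_matrix)
    sort_sent_weight = ranking_base_on_weight(words_weight, position_weight,
                                              similarity_weight, feature_weight)
    return get_summarization(sentence_with_index, sort_sent_weight, topK_ratio)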


if __name__ == '__main__':
    # test_text = '../../data/training17.txt'
    # with open(test_text, 'r', encoding='utf-8') as f:
    #     text = f.read()
    text = '''网易娱乐7月21日报道 林肯公园主唱查斯特·贝宁顿Chester Bennington于今天早上在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡年仅41岁。此消息已得到洛杉矶警方证实。
  洛杉矶警方透露Chester的家人正在外地度假Chester独自在家上吊地点是家里的二楼。一说是一名音乐公司工作人员来家里找他时发现了尸体也有人称是佣人最早发现其死亡。
  林肯公园另一位主唱麦克·信田确认了Chester Bennington自杀属实并对此感到震惊和心痛称稍后官方会发布声明。Chester昨天还在推特上转发了一条关于曼哈顿垃圾山的新闻。粉丝们纷纷在该推文下留言不相信Chester已经走了。
  外媒猜测Chester选择在7月20日自杀的原因跟他极其要好的朋友、Soundgarden(声音花园)乐队以及Audioslave乐队主唱Chris Cornell有关因为7月20日是Chris Cornell的诞辰。而Chris Cornell于今年5月17日上吊自杀享年52岁。Chris去世后Chester还为他写下悼文。
  对于Chester的自杀亲友表示震惊但不意外因为Chester曾经透露过想自杀的念头他曾表示自己童年时被虐待导致他医生无法走出阴影也导致他长期酗酒和嗑药来疗伤。目前洛杉矶警方仍在调查Chester的死因。
  据悉Chester与毒品和酒精斗争多年年幼时期曾被成年男子性侵导致常有轻生念头。Chester生前有过2段婚姻育有6个孩子。
  林肯公园在今年五月发行了新专辑《多一丝曙光One More Light》成为他们第五张登顶Billboard排行榜的专辑。而昨晚刚刚发布新单《Talking To Myself》MV。'''
    stop_word = []
    # The stop-word list file includes many Chinese stop words.
    with open('../../data/stopWordList.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            stop_word.append(line.strip())

    sentence_set, sentence_with_index = split_sentence(text, punctuation_list='!?。!?')
    tfidf_matrix = get_tfidf_matrix(sentence_set, stop_word)
    sentence_with_words_weight = get_sentence_with_words_weight(tfidf_matrix)
    sentence_with_position_weight = get_sentence_with_position_weight(sentence_set)
    sentence_score = get_similarity_weight(tfidf_matrix)
    sort_sent_weight = ranking_base_on_weight(sentence_with_words_weight,
                                              sentence_with_position_weight,
                                              sentence_score, feature_weight=[1, 1, 1])
    summarization = get_summarization(sentence_with_index, sort_sent_weight, topK_ratio=0.3)
    print('summarization:\n', summarization)
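
    # Equivalent one-liner using the summarize wrapper defined above:
    # print(summarize(text, stop_word, feature_weight=(1, 1, 1), topK_ratio=0.3))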