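"""Topic clustering for social/news documents.

Each document's message is condensed into a short feature string, the features
are embedded with SBERT, and documents are grouped into topics with
agglomerative clustering on cosine distance. An optional spam-check service
filters documents first.
"""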
import json
import time
from .utils import get_sbert_embedding, clean_text  # relative import: run this module as part of its package
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize
import requests
# from clean_text import normalize_text
MAX_LENGTH_FEATURE = 250
MIN_LENGTH_FEATURE = 100
URL_CHECK_SPAM = "http://10.9.3.70:30036/predict"
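

# The spam-check service expects {"domain_id": ..., "records": [{"text": ..., "idxcol": ...}, ...]}
# and returns one result per record; only documents whose result has label == 0 (non-spam) are kept.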
def check_spam(docs):
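    """Drop documents that the spam-classification service labels as spam."""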
    json_body = {
        "domain_id": "",
        "records": [
            {
                "text": doc.get("message", ""),
                "idxcol": 1
            } for doc in docs
        ]
    }
    result = requests.post(URL_CHECK_SPAM, json=json_body).json()
    docs = [x for i, x in enumerate(docs) if result[i]["label"] == 0]
    return docs


def preprocess_feature(doc):
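    """Condense a document's message into a short feature string for clustering.

    Whole paragraphs are appended while the feature stays under MAX_LENGTH_FEATURE;
    when a paragraph would overflow, it is split into sentences and sentences are
    appended until the feature is long enough.
    """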
    message = doc.get("message", "")
    paras = message.split("\n")
    feature = ""
    paras = [clean_text(x.strip(), normalize=False) for x in paras if x.strip() and len(x.strip()) > 10]
    for para in paras:
        if len(feature) + len(para) < MAX_LENGTH_FEATURE:
            feature += " " + para
        elif len(feature) < MIN_LENGTH_FEATURE:
            sens = sent_tokenize(para)
            for sen in sens:
                if len(feature) + len(sen) < MAX_LENGTH_FEATURE or len(feature.strip()) < MIN_LENGTH_FEATURE:
                    feature += " " + sen
    return feature


def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5,
                     sorted_field='', max_doc_per_cluster=50, delete_message=True, is_check_spam=True):
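    """Cluster documents into topics and return the `top_cluster` largest clusters.

    Each document's "title" is replaced by a short extracted feature, features are
    embedded with SBERT, and documents are grouped by agglomerative clustering
    (cosine distance, complete linkage). Returns a dict mapping cluster id (string)
    to its documents sorted by message length, with `num_docs` and `max_score`
    stored on the first document of each cluster.
    """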
    # global model, model_en
    docs = [x for x in docs if len(x.get("message", "")) > 100]
    docs = docs[:30000]
    if is_check_spam:
        docs = check_spam(docs)
    result = {}
    cluster_score = {}
    t1 = time.time()
    if len(docs) < 1:
        return result
    elif len(docs) == 1:
        return {
            "0": docs
        }
    # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
    f_docs = []
    for x in docs:
        ft = preprocess_feature(x)
        if len(ft) > MIN_LENGTH_FEATURE:
            x["title"] = ft
            f_docs.append(x)
    docs = f_docs
    features = [x["title"] for x in docs]
    # with open("feature", 'w') as f:
    #     json.dump(features, f, ensure_ascii=False)
    # print(features)
    vectors = get_sbert_embedding(features)
    # NOTE: newer scikit-learn releases (>=1.2) rename `affinity` to `metric`.
    clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
                                        linkage='complete', distance_threshold=distance_threshold)
    clusteror.fit(vectors)
    print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
    for i in range(clusteror.n_clusters_):
        result[str(i + 1)] = []
        cluster_score[str(i + 1)] = 0
    for i in range(len(clusteror.labels_)):
        cluster_no = clusteror.labels_[i]
        if docs[i].get('domain', '') not in ["cungcau.vn", "baomoi.com", "news.skydoor.net"]:
            response_doc = {}
            response_doc = docs[i]
            score = response_doc.get('score', 0)
            if not docs[i].get('message', '').strip():
                continue
            if score > cluster_score[str(cluster_no + 1)]:
                cluster_score[str(cluster_no + 1)] = score
            if 'domain' in docs[i]:
                response_doc['domain'] = docs[i]['domain']
            if 'url' in docs[i]:
                response_doc['url'] = docs[i]['url']
            if 'title' in docs[i]:
                response_doc['title'] = clean_text(docs[i]['title'])
            if 'snippet' in docs[i]:
                response_doc['snippet'] = clean_text(docs[i]['snippet'])
            if 'created_time' in docs[i]:
                response_doc['created_time'] = docs[i]['created_time']
            if "sentiment" in docs[i]:
                response_doc['sentiment'] = docs[i]['sentiment']
            if 'message' in docs[i]:
                title = docs[i].get('title', '')
                snippet = docs[i].get('snippet', '')
                message = docs[i].get('message', '')
                # if title.strip():
                #     split_mess = message.split(title)
                #     if len(split_mess) > 1:
                #         message = title.join(split_mess[1:])
                # if snippet.strip():
                #     split_mess = message.split(snippet)
                #     if len(split_mess) > 1:
                #         message = snippet.join(split_mess[1:])
                response_doc['message'] = clean_text(message)
            if 'id' in docs[i]:
                response_doc['id'] = docs[i]['id']
            # response_doc['score'] = 0.0
            # response_doc['title_summarize'] = []
            # response_doc['content_summary'] = ""
            # response_doc['total_facebook_viral'] = 0
            result[str(cluster_no + 1)].append(response_doc)
    empty_clus_ids = []
    for x in result:
        result[x] = sorted(result[x], key=lambda i: -len(i.get('message', '')))
        if len(result[x]) > 0:
            # if len(result[x]) > 1:
            #     result[x] = check_duplicate_title_domain(result[x])
            result[x][0]['num_docs'] = len(result[x])
            result[x][0]['max_score'] = cluster_score[x]
        else:
            empty_clus_ids.append(x)
    for x in empty_clus_ids:
        result.pop(x, None)
    result = dict(sorted(result.items(), key=lambda i: -len(i[1]))[:top_cluster])
    return result
    # return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field=sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)


if __name__ == '__main__':
    # with open("/home2/vietle/DA-Report/social.json", 'r') as f:
    #     docs = json.load(f)[:2000]
    with open("/home2/vietle/news-cms/topic_summarization/data/news_cms.social.json", 'r') as f:
        docs = json.load(f)[:10000]
    clusters = topic_clustering(docs, distance_threshold=0.2, top_cluster=5000, top_sentence=5, topn_summary=5,
                                sorted_field='', max_doc_per_cluster=50, delete_message=False)
    with open("/home2/vietle/news-cms/topic_summarization/cluster/news_cms.social.json", 'w') as f:
        json.dump(clusters, f, ensure_ascii=False)