-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsummarizer.js
120 lines (93 loc) · 3.02 KB
/
summarizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
'use strict';
const {
SentenceTokenizer,
WordTokenizer,
TfIdf,
Lexicon,
RuleSet,
BrillPOSTagger,
} = require('natural');
const stopword = require('stopword');
/**
 * Builds an extractive summary of `text`.
 *
 * The text is split into sentences, each sentence is scored through
 * calculateTFIDFScore / calculateSentencesScore, and every sentence whose
 * score is at or above the value returned by calculateAverageSentenceScore is
 * kept, in its original order.
 *
 * @param {string} text - The text to summarize.
 * @returns {string} The selected sentences joined by single spaces; an empty
 *   string when `text` is empty or contains only whitespace.
 */
function summarizeText(text) {
  // Replace every whitespace character (newline, tab, ...) one-for-one with a
  // plain space so the sentence tokenizer sees a single-line string.
  const normalized = text.trim().replace(/\s/g, ' ');

  // Nothing to summarize: skip tokenizing and scoring altogether.
  if (normalized === '') return '';

  const sentences = new SentenceTokenizer().tokenize(normalized);
  const sentencesScore = calculateSentencesScore(
    calculateTFIDFScore(sentences)
  );
  const averageSentenceScore = calculateAverageSentenceScore(sentencesScore);

  // sentencesScore is keyed by sentence index, so it can be looked up
  // directly while filtering the sentence list.
  return sentences
    .filter((_, index) => sentencesScore[index] >= averageSentenceScore)
    .join(' ')
    .trim();
}
/**
 * Computes TF-IDF weights for the preprocessed words of every sentence.
 *
 * Each sentence is registered as one document of a TfIdf corpus; the words
 * returned by wordPreprocessing for that sentence are then weighted against
 * the corpus.
 *
 * @param {string[]} sentences - Sentences of the text, in order.
 * @returns {Object<number, Object<string, number>>} For each sentence index,
 *   a map from preprocessed word to its TF-IDF weight in that sentence.
 */
function calculateTFIDFScore(sentences) {
  const corpus = new TfIdf();

  // Every document must be registered before any weight is requested,
  // so the corpus is filled in a separate first pass.
  sentences.forEach(doc => {
    corpus.addDocument(doc);
  });

  const scoresBySentence = {};
  sentences.forEach((doc, docIndex) => {
    const weights = {};
    for (const term of wordPreprocessing(doc)) {
      weights[term] = corpus.tfidf(term, docIndex);
    }
    scoresBySentence[docIndex] = weights;
  });

  return scoresBySentence;
}
// POS tags whose tokens wordPreprocessing keeps: the NN* (noun) and VB* (verb)
// tags produced by the tagger.
const CONTENT_WORD_TAGS = new Set([
  'NN',
  'NNP',
  'NNS',
  'VB',
  'VBD',
  'VBG',
  'VBN',
  'VBP',
  'VBZ',
]);

// Tagger shared by every wordPreprocessing call; built on first use so the
// lexicon and rule set are constructed once instead of once per sentence.
let sharedPosTagger;

/** Returns the shared English Brill POS tagger, creating it on first use. */
function getPosTagger() {
  if (sharedPosTagger === undefined) {
    sharedPosTagger = new BrillPOSTagger(
      // 'N' and 'NNP' are the lexicon's default-category arguments
      // (see natural's Lexicon documentation).
      new Lexicon('EN', 'N', 'NNP'),
      new RuleSet('EN')
    );
  }
  return sharedPosTagger;
}

/**
 * Extracts the distinct, lowercased noun and verb tokens of a sentence.
 *
 * Steps: tokenize, drop stop words, deduplicate, strip digits and
 * underscores, POS-tag, keep only tokens tagged as nouns/verbs that are
 * longer than one character, lowercase, deduplicate again.
 *
 * @param {string} sentence - A single sentence.
 * @returns {string[]} Unique lowercased tokens, in order of first appearance.
 */
function wordPreprocessing(sentence) {
  const tokens = new WordTokenizer().tokenize(sentence);

  // The tagger receives a duplicate-free token list with digits and
  // underscores removed.
  const candidates = Array.from(new Set(stopword.removeStopwords(tokens))).map(
    word => word.replace(/[0-9]/g, '').replace(/_/g, '')
  );

  const { taggedWords } = getPosTagger().tag(candidates);
  const keptWords = taggedWords
    .filter(({ tag, token }) => CONTENT_WORD_TAGS.has(tag) && token.length > 1)
    .map(({ token }) => token.toLowerCase());

  // Stripping characters and lowercasing can make distinct tokens collide
  // (e.g. "Word" / "word", "abc1" / "abc"), so deduplicate the final list.
  return Array.from(new Set(keptWords));
}
/**
 * Reduces per-word weights to a single score per sentence.
 *
 * A sentence's score is the arithmetic mean of its word weights. A sentence
 * with no words (mean is NaN) or a zero mean gets a score of 0.
 *
 * @param {Object<number, Object<string, number>>} tfidfScores - For each
 *   sentence index, a map from word to weight.
 * @returns {Object<number, number>} Sentence index mapped to its mean weight.
 */
function calculateSentencesScore(tfidfScores) {
  const meanBySentence = {};

  for (const index in tfidfScores) {
    const weights = Object.values(tfidfScores[index]);
    const total = weights.reduce((sum, weight) => sum + weight, 0);
    const mean = total / weights.length;
    // Any falsy mean (NaN from 0 / 0, or 0 itself) is stored as 0.
    meanBySentence[index] = mean ? mean : 0;
  }

  return meanBySentence;
}
/**
 * Computes the arithmetic mean of all sentence scores.
 *
 * @param {Object<number, number>} sentencesScore - Sentence index mapped to
 *   its score.
 * @returns {number} The mean score, or 0 when there are no scores (instead of
 *   the NaN that 0 / 0 would produce).
 */
function calculateAverageSentenceScore(sentencesScore) {
  const scores = Object.values(sentencesScore);
  if (scores.length === 0) return 0;

  const total = scores.reduce((sum, score) => sum + score, 0);
  return total / scores.length;
}
// Public API: the module's export is the summarizeText function itself.
module.exports = summarizeText;