微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

JavaScript – 从文本中提取关键短语(1-4 个词的 n-gram)

从文本块中提取关键短语的最佳方式是什么?我正在编写一个关键词提取工具(类似于现有的在线关键词提取工具)。我找到了一些用于提取 n-gram 的 Python 和 Perl 库,但我是在 Node 中编写这个工具的,所以需要一个 JavaScript 解决方案。如果没有现成的 JavaScript 库,有人能解释一下如何实现吗?这样我就可以自己动手写了。

解决方法

我喜欢这个想法,所以我把它实现了:见下面的代码(包含说明性注释)。
在线预览: http://fiddle.jshell.net/WsKMx/
/*@author Rob W,created on 16-17 September 2011,on request for Stackoverflow (https://stackoverflow.com/q/7085454/938089)
 * Modified on 17 July 2012, fixed IE bug by replacing [,] with [null]
 * This script will count words and phrases. For simplicity and efficiency,
 * there's only one loop through a block of text.
 * A 100% accuracy requires much more computing power,which is usually unnecessary
 **/


var text = "A quick brown fox jumps over the lazy old bartender who said 'Hi!' as a response to the visitor who presumably assaulted the maid's brother,because he didn't pay his debts in time. In time in time does really mean in time. Too late is too early? Nonsense! 'Too late is too early' does not make any sense.";

var atLeast = 2;       // Show results with at least .. occurrences
var numWords = 5;      // Show statistics for one to .. words (inclusive)
var ignoreCase = true; // When true, the text is lower-cased before counting
var REallowedChars = /[^a-zA-Z'\-]+/g;
 // Matches runs of characters that are NOT allowed in a word.
 // Every such run is replaced with a single space.

var i,k,textlen,len;

/**
 * Turns a block of text into a list of words.
 *
 * @param {string} rawText         The text to split.
 * @param {RegExp} disallowedChars Global pattern matching the characters to discard;
 *                                 each match is replaced by a space.
 * @param {boolean} lowerCase      When true, the text is lower-cased first.
 * @returns {string[]} The words in their original order; an empty array when
 *                     no valid characters remain.
 */
function splitIntoWords(rawText, disallowedChars, lowerCase) {
    var cleaned = rawText.replace(disallowedChars," ").replace(/^\s+/,"").replace(/\s+$/,"");
    if (lowerCase) cleaned = cleaned.toLowerCase();
    // "".split(/\s+/) yields [""], which would be counted as a word
    return cleaned ? cleaned.split(/\s+/) : [];
}

/**
 * Counts every phrase of 1..maxWords consecutive words in a single pass.
 *
 * @param {string[]} words  The words, in text order.
 * @param {number} maxWords Longest phrase length (in words) to count.
 * @returns {Array} counts, where counts[n] maps each n-word phrase (words joined
 *                  by one space) to its number of occurrences; counts[0] is null.
 */
function countPhrases(words, maxWords) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var counts = [null]; // index 0 is unused: there is no zero-word phrase
    var total = words.length;
    var size, start, phrase, table;
    for (size=1; size<=maxWords; size++) {
        counts.push({});
    }
    for (start=0; start<total; start++) {
        phrase = "";
        // Grow the phrase one word at a time, stopping at the end of the text
        for (size=1; size<=maxWords && start+size<=total; size++) {
            phrase += (size === 1 ? "" : " ") + words[start+size-1];
            table = counts[size];
            // Own-property check: a word such as "constructor" must not pick up
            // the value inherited from Object.prototype
            table[phrase] = hasOwn.call(table, phrase) ? table[phrase] + 1 : 1;
        }
    }
    return counts;
}

// Split the text into words. `text` holds the word array from here on.
text = splitIntoWords(text, REallowedChars, ignoreCase);
textlen = text.length;

// keys[n] maps every n-word phrase to its number of occurrences
var keys = countPhrases(text, numWords);

// Prepares results for advanced analysis:
// results[n] lists the n-word phrases that occur at least `atLeast` times
var results = [];
for (k=1; k<=numWords; k++) {
    results[k] = [];
    var key = keys[k];
    for (var phrase in key) {
        if (Object.prototype.hasOwnProperty.call(key, phrase) && key[phrase] >= atLeast) {
            results[k].push({"word":phrase,"count":key[phrase]});
        }
    }
}

// Result parsing
var outputHTML = []; // Buffer of table rows; joined into the final markup below

// Note: despite its name, this comparator puts the highest count first
var f_sortAscending = function(x,y) {return y.count - x.count;};
for (k=1; k<=numWords; k++) {
    results[k].sort(f_sortAscending);//sorts results

    // Customize your output. For example:
    var words = results[k];
    if (words.length) outputHTML.push('<td colSpan="3" class="num-words-header">'+k+' word'+(k===1?"":"s")+'</td>');
    for (i=0,len=words.length; i<len; i++) {

        // Words only contain letters, apostrophes and hyphens (see REallowedChars),
        // so they can be inserted into the markup without escaping
        outputHTML.push("<td>" + words[i].word + "</td><td>" +
           words[i].count + "</td><td>" +
           Math.round(words[i].count/textlen*10000)/100 + "%</td>");
           // Share relative to the total number of words (textlen),
           // as a percentage rounded to 2 decimals
    }
}
outputHTML = '<table id="wordAnalysis"><thead><tr>' +
              '<td>Phrase</td><td>Count</td><td>Relativity</td></tr>' +
              '</thead><tbody><tr>' +outputHTML.join("</tr><tr>")+
               "</tr></tbody></table>";

// Only touch the DOM when there is one (the script may also run outside a
// browser, e.g. in Node) and the target element exists
var outputContainer = typeof document !== "undefined" ? document.getElementById("RobW-sample") : null;
if (outputContainer) outputContainer.innerHTML = outputHTML;
/*
CSS:
#wordAnalysis td{padding:1px 3px 1px 5px}
.num-words-header{font-weight:bold;border-top:1px solid #000}

HTML:
<div id="RobW-sample"></div>
*/

原文地址:https://www.jb51.cc/js/150529.html

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐