优化的EmEditor宏可对元素进行排序，拆分和重复数据删除单个列并提取计数

如何解决优化的EmEditor宏可对元素进行排序，拆分和重复数据删除单个列并提取计数

我目前有一个单独的文件，采用如下格式：共3列，列与列之间以制表符“\t”分隔，列内的各个元素以“;”分隔。

    COL1\tCOL2\tCOL3
    abc\t123;1q\tapple\t
    dfg\t234;2w\tapple;apple\t
    hij\t345;3e\tbanana;apple;cherry;\t
    klm\t456;4r\tapple;banana;cherry;banana;cherry;\t
    nop\t567;5t\t;;apple;banana;cherry;banana;;cherry;;\t

我希望有一个经过优化的宏（最好是JavaScript）来处理该文件，并输出如下内容：对第3列中的元素进行排序（同时删除所有多余/不需要的分隔符），并删除重复项；新增的第4列是去重后的元素计数。

    abc\t123;1q\tapple\t1
    dfg\t234;2w\tapple\t1
    hij\t345;3e\tapple;banana;cherry\t3
    klm\t456;4r\tapple;banana;cherry\t3
    nop\t567;5t\tapple;banana;cherry\t3

我一直在尝试与下面类似的方法，但我认为它还可以更快。

    // Asker's first attempt: for every row from the 2nd onward, count the ';'
    // separators in one cell and write (separator count + 1) into another cell.
    // NOTE(review): this reads column 2 and writes column 3, whereas the
    // surrounding text names column 3 as the column to process - confirm the
    // intended column indexes before reusing this fragment.
    // totalLines is not defined in this fragment; it has to be set beforehand.
    for( iRow = 2; iRow <= totalLines; iRow++ ) { // traverse each row, starting at the 2nd row
      str = document.GetCell(iRow,2,eeCellIncludeQuotes);
      // match() yields null when no ';' is present, hence the "|| []" fallback
      var count = (str.match(/;/g) || []).length;
      var numOfElements = count + 1;
      document.SetCell( iRow,3,numOfElements,eeAutoQuote );
    }

因此，用户应先选择要处理的列（在本示例中为第3列），宏将仅在该列上运行，然后将计数输出到其右侧的新列。

真正的源文件将是数百万行，因此，如果可以通过任何方式对EmEditor进行优化，那就太好了。

解决方法

我通过创建一个函数来统计字符串中的分号（而不是使用正则表达式）对宏进行了优化（第二版），并且还使用了GetColumn和SetColumn方法来提高速度（第三版）。第三个版本将插入一列，而不是覆盖现有列。

原始宏（为保证正确性并加入计时而做了修改）

// Baseline macro: walks the document one cell at a time, starting at row 2,
// counts the ';' separators found in column 3 with a regular expression and
// writes (separator count + 1) into column 4. The elapsed time is reported
// at the end so the later versions can be compared against it.
var start = new Date().getTime();

var totalLines = document.GetLines();
// Declared explicitly so the macro does not create implicit globals.
var iRow, str, count, numOfElements;
for( iRow = 2; iRow <= totalLines; iRow++ ) { // traverse each row, starting at the 2nd row
    str = document.GetCell( iRow, 3, eeCellIncludeQuotes );
    // match() yields null when no ';' is present, hence the "|| []" fallback
    count = ( str.match( /;/g ) || [] ).length;
    numOfElements = count + 1;
    document.SetCell( iRow, 4, numOfElements, eeAutoQuote );
}

var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );

第二版

/**
 * Counts how many ';' characters a string contains.
 * Uses repeated indexOf() searches instead of a regular expression.
 *
 * @param {string} str - text to scan
 * @returns {number} number of ';' characters found (0 when there are none)
 */
function CountSemiColon( str )
{
    var total = 0;
    var pos = str.indexOf( ';' );
    while( pos != -1 ) {
        ++total;
        // resume the search just past the separator that was found
        pos = str.indexOf( ';', pos + 1 );
    }
    return total;
}

// Second version: same cell-by-cell traversal as the baseline macro, but the
// separators are counted with CountSemiColon() instead of a regular expression.
var start = new Date().getTime();

var totalLines = document.GetLines();
for( var iRow = 2; iRow <= totalLines; iRow++ ) { // traverse each row, starting at the 2nd row
    // GetCell/SetCell take the column index as their second argument:
    // read the element list from column 3, write the element count to column 4.
    var str = document.GetCell( iRow, 3, eeCellIncludeQuotes );
    document.SetCell( iRow, 4, CountSemiColon( str ) + 1, eeAutoQuote );
}

var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );

第三版

/**
 * Returns the number of ';' characters contained in a string.
 *
 * @param {string} str - text to scan
 * @returns {number} how many ';' characters were found
 */
function CountSemiColon( str )
{
    var found = 0;
    // each iteration jumps straight to the next ';' after the previous hit
    for( var at = str.indexOf( ';' ); at != -1; at = str.indexOf( ';', at + 1 ) ) {
        ++found;
    }
    return found;
}

// Third version: fetches the whole of column 3 with a single GetColumn() call,
// computes every count in memory and inserts the results as a new column 4
// with a single InsertColumn() call, instead of touching the document once
// per cell.
var start = new Date().getTime();

var totalLines = document.GetLines();
// Rows 2..totalLines of column 3, returned as one string with "\n" between cells.
var s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
var sLines = s1.split( "\n" );
var s2 = "";
var nTotal = sLines.length;
for( var y = 0; y < nTotal; y++ ) {
    s2 += CountSemiColon( sLines[y] ) + 1 + "\n";
}
// Drop the trailing "\n" appended after the last count.
var x = s2.length;
if( x > 0 ) s2 = s2.substr( 0, x - 1 );
// The delimiter has to be passed here as well: it is the same "\n" that
// separates the counts in s2. Insertion starts at row 2.
document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );

var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );

第四版（对于空白单元格，返回0）

/**
 * Counts the ';'-separated elements of a cell.
 *
 * @param {string} str - cell text
 * @returns {number} 0 when the cell is an empty string or consists of a single
 *                   tab character; otherwise the number of ';' characters plus one
 */
function CountElements( str )
{
    var isBlank = ( str.length == 0 ) || ( str == '\t' );
    if( isBlank ) {
        return 0;
    }
    // a non-blank cell holds at least one element; every ';' adds another one
    var elements = 1;
    var pos = str.indexOf( ';' );
    while( pos != -1 ) {
        ++elements;
        pos = str.indexOf( ';', pos + 1 );
    }
    return elements;
}

// Fourth version: same bulk approach as the third version (one GetColumn()
// call, in-memory counting, one InsertColumn() call), but the counting is
// delegated to CountElements(), which yields 0 for blank cells.
var start = new Date().getTime();

var totalLines = document.GetLines();
// Rows 2..totalLines of column 3, returned as one string with "\n" between cells.
var s1 = document.GetColumn( 3, "\n", eeCellIncludeQuotesAndDelimiter, 2, totalLines - 1 );
var sLines = s1.split( "\n" );
var s2 = "";
var nTotal = sLines.length;
for( var y = 0; y < nTotal; y++ ) {
    s2 += CountElements( sLines[y] ) + "\n";
}
// Drop only the trailing "\n" appended after the last count.
var x = s2.length;
if( x > 0 ) s2 = s2.substr( 0, x - 1 );
// Write the counts back as a new column 4, starting at row 2.
document.InsertColumn( 4, s2, "\n", eeDontQuote, 2 );

var end = new Date().getTime();
var time = end - start;
alert( "Execution time: " + time + " ms" );

测试结果：

10429毫秒
8496毫秒
1803毫秒
1890毫秒

100万行，52 MB CSV文件。

如果这还不够快，或者发生“内存不足”错误，我会考虑其他方法或进行进一步优化，所以请告诉我。

优化的EmEditor宏可对元素进行排序，拆分和重复数据删除单个列并提取计数

如何解决优化的EmEditor宏可对元素进行排序，拆分和重复数据删除单个列并提取计数

解决方法

相关推荐