C/C++ 中的快速字符串标记化（tokenization）

如何在 C/C++ 中实现快速的字符串标记化

我正在开发一个 C/C++ 应用程序（使用 Visual Studio 2010），需要对以逗号分隔的字符串进行标记化（tokenize），并且希望速度尽可能快。目前我使用的是 strtok_s。我对 strtok_s 和 sscanf 做了一些测试，看起来 strtok_s 更快（除非我的实现写得很糟糕 :)），但我想知道是否有人能推荐更快的方案。

解决方法

如果只追求运行速度，boost.spirit.qi 是一个很好的选择。最好的办法是确保只对字符串遍历一次，并在遍历过程中即时生成输出：先把字符逐个提取到临时缓冲区中，遇到定界符时把临时缓冲区的内容保存到输出集合，然后清空临时缓冲区，如此反复。下面是这种做法的一个基本实现。

template<class C=char>
struct basic_token
{
    typedef std::basic_string<C> token_string;
    typedef unsigned long size_type;
    token_string token_,delim_;
    basic_token(const token_string& token,const token_string& delim = token_string());
};

template<class C>
basic_token<C>::basic_token(const token_string& token,const token_string& delim)
:   token_(token),delim_(delim)
{
}

typedef basic_token<char> token;

template<class Char,class Iter> void tokenize(const std::basic_string<Char>& line,const Char* delims,Iter itx)
{
    typedef basic_token<Char> Token;
    typedef std::basic_string<Char> TString;

    for( TString::size_type tok_begin = 0,tok_end = line.find_first_of(delims,tok_begin);
        tok_begin != TString::npos; tok_end = line.find_first_of(delims,tok_begin) )
    {
        if( tok_end == TString::npos )
        {
            (*itx++) = Token(TString(&line[tok_begin]));
            tok_begin = tok_end;
        }
        else
        {
            (*itx++) = Token(TString(&line[tok_begin],&line[tok_end]),TString(1,line[tok_end]));
            tok_begin = tok_end + 1;
        }
    }
}

template<class Char,class Iter> void tokenize(const Char* line,const Char* delim,Iter itx)
{
    tokenize(std::basic_string<Char>(line),delim,itx);
}
template<class Stream,class Token> Stream& operator<<(Stream& os,const Token& tok)
{
    os << tok.token_ << \"\\t[\" << tok.delim_ << \"]\";
    return os;
}

……用法如下：

string raw = \"35=BW|49=TEST|1346=REQ22|1355=2|1182=88500|1183=88505|10=087^\";
vector<stoken> tokens;
tokenize(raw,\"|\",back_inserter(tokens));
copy(tokens.begin(),tokens.end(),ostream_iterator<stoken>(cout,\"\\n\"));

输出为：

35=BW   [|]
49=TEST [|]
1346=REQ22      [|]
1355=2  [|]
1182=88500      [|]
1183=88505      [|]
10=087^ []

我想提醒您，strtok 及其同类函数存在一个风险：得到的标记数量可能与您预期的不同（strtok 会把连续的定界符当作一个，因此空字段会被跳过）。

one|two|three  would yield 3 tokens

而

one|||three    would yield 2.

mmhmm 的测试没有正确使用 Spirit，他的语法有缺陷。

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <iostream>
#include <string>

#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/fusion/include/io.hpp>

#include <boost/proto/deep_copy.hpp>
#include <boost/spirit/include/qi.hpp>

/****************************strtok_r************************/
typedef struct sTokenDataC {
    char *time;
    char *symb;
    float bid;
    float ask;
    int bidSize;
    int askSize;
} tokenDataC;

tokenDataC parseTick( char *line,char *parseBuffer )
{
    tokenDataC tokenDataOut;

    tokenDataOut.time = strtok_r( line,\",&parseBuffer );
    tokenDataOut.symb = strtok_r( nullptr,&parseBuffer );
    tokenDataOut.bid = atof(strtok_r( nullptr,&parseBuffer ));
    tokenDataOut.ask = atof(strtok_r( nullptr,&parseBuffer ));
    tokenDataOut.bidSize = atoi(strtok_r( nullptr,&parseBuffer ));
    tokenDataOut.askSize = atoi(strtok_r( nullptr,&parseBuffer  ));

    return tokenDataOut;
}

void test_strcpy_s(int iteration)
{
    char *testStringC = new char[64];
    char *lineBuffer = new char[64];

    printf(\"test_strcpy_s....\\n\");
    strcpy(testStringC,\"09:30:00,TEST,13.24,15.32,10,14\");
    {
        timeEstimate<> es;
        tokenDataC tokenData2;
        for(int i = 0; i < iteration; i++)
        {
            strcpy(lineBuffer,testStringC);//this is more realistic since this has to happen because I want to preserve the line
            tokenData2 = parseTick(lineBuffer,testStringC);
            //std::cout<<*tokenData2.time<<\",\"<<*tokenData2.symb<<\",\";
            //std::cout<<tokenData2.bid<<\",\"<<tokenData2.ask<<\",\"<<tokenData2.bidSize<<\",\"<<tokenData2.askSize<<std::endl;
        }
    }

    delete[] lineBuffer;
    delete[] testStringC;
}
/****************************strtok_r************************/

/****************************spirit::qi*********************/
namespace qi = boost::spirit::qi;

struct tokenDataCPP
{
    std::string time;
    std::string symb;
    float bid;
    float ask;
    int bidSize;
    int askSize;

    void clearTimeSymb(){
        time.clear();
        symb.clear();
    }
};

BOOST_FUSION_ADAPT_STRUCT(
        tokenDataCPP,(std::string,time)
        (std::string,symb)
        (float,bid)
        (float,ask)
        (int,bidSize)
        (int,askSize)
        )

void test_spirit_qi(int iteration)
{
    std::string const strs(\"09:30:00,14\");
    tokenDataCPP data;        

    auto myString = *~qi::char_(\",\");
    auto parser = myString >> \",\" >> myString >> \",\" >> qi::float_ >> \",\" >> qi::int_  >> \",\" >> qi::int_;
    {
        std::cout<<(\"test_spirit_qi....\\n\");
        timeEstimate<> es;
        for(int i = 0; i < iteration; ++i){
            qi::parse(std::begin(strs),std::end(strs),parser,data);
            //std::cout<<data.time<<\",\"<<data.symb<<\",\";
            //std::cout<<data.bid<<\",\"<<data.ask<<\",\"<<data.bidSize<<\",\"<<data.askSize<<std::endl;
            data.clearTimeSymb();
        }
    }
}
/****************************spirit::qi*********************/

int main()
{
    int const ITERATIONS = 500 * 10000;
    test_strcpy_s(ITERATIONS);
    test_spirit_qi(ITERATIONS);
}

由于 clang++ 没有 strtok_s，我用 strtok_r 代替它。迭代 500 * 10k 次的耗时为：test_strcpy_s：1.40951，test_spirit_qi：1.34277。两者的耗时几乎相同，相差无几。编译器：clang++ 3.2，选项 -O2。（计时代码未在此列出。）下面这个实现应该相当快：它不使用临时缓冲区，并且空标记也会被输出。

template <class char_t,class char_traits_t,class char_allocator_t,class string_allocator_t>
inline void _tokenize(
    const std::basic_string<char_t,char_traits_t,char_allocator_t>& _Str,const char_t& _Tok,std::vector<std::basic_string<char_t,char_allocator_t>,string_allocator_t>& _Tokens,const size_t& _HintSz=10)
{
    _Tokens.reserve(_HintSz);

    const char_t* _Beg(&_Str[0]),*_End(&_Str[_Str.size()]); 

    for (const char_t* _Ptr=_Beg; _Ptr<_End; ++_Ptr)
    {
        if (*_Ptr == _Tok)
        {
            _Tokens.push_back(
                std::basic_string<char_t,char_allocator_t>(_Beg,_Ptr));

            _Beg = 1+_Ptr;
        }
    }

    _Tokens.push_back(
        std::basic_string<char_t,_End));
}

在对每个建议的候选方案进行测试和计时之后，结果是 strtok 明显最快。尽管这一结果让热衷测试的我有些惊讶，但探索其他选择仍然是值得的。[注意：代码是临时拼凑起来的，欢迎修改 :)] 给定：

typedef struct sTokenDataC {
    char *time;
    char *symb; 
    float bid;
    float ask;
    int bidSize;
    int askSize;
} tokenDataC;

tokenDataC parseTick( char *line,char *parseBuffer )
{
    tokenDataC tokenDataOut;

    tokenDataOut.time = strtok_s( line,&parseBuffer );
    tokenDataOut.symb = strtok_s( null,&parseBuffer );
    tokenDataOut.bid = atof(strtok_s( null,&parseBuffer ));
    tokenDataOut.ask = atof(strtok_s( null,&parseBuffer ));
    tokenDataOut.bidSize = atoi(strtok_s( null,&parseBuffer ));
    tokenDataOut.askSize = atoi(strtok_s( null,&parseBuffer  ));

    return tokenDataOut; 
}

char *testStringC = new char[64];
    strcpy(testStringC,14\");

int _tmain(int argc,_TCHAR* argv[])
{
char *lineBuffer = new char[64];
    printf(\"Testing method2....\\n\");
    for(int i = 0; i < ITERATIONS; i++)
    {
        strcpy(lineBuffer,testStringC);//this is more realistic since this has to happen because I want to preserve the line
        tokenData2 = parseTick(lineBuffer,parseBuffer);

    }
}

与之对比的是按以下方式调用 John Dibling 建议的实现：

    struct sTokenDataCPP
    {
        std::basic_string<char> time;
        std::basic_string<char> symb; 
        float bid;
        float ask;
        int bidSize;
        int askSize;
    };
        std::vector<myToken> tokens1;
            tokenDataCPP tokenData;
            printf(\"Testing method1....\\n\");
            for(int i = 0; i < ITERATIONS; i++)
            {
tokens1.clear();
                tokenize(raw,std::back_inserter(tokens1));

                tokenData.time.assign(tokens1.at(0).token_);
                tokenData.symb.assign(tokens1.at(1).token_);
                tokenData.ask = atof(tokens1.at(2).token_.c_str());
                tokenData.bid = atof(tokens1.at(3).token_.c_str());
                tokenData.askSize = atoi(tokens1.at(4).token_.c_str());
                tokenData.bidSize = atoi(tokens1.at(5).token_.c_str());

            }

以及一个简单的 boost.spirit.qi 实现，其语法定义如下：

template <typename Iterator>
struct tick_parser : grammar<Iterator,tokenDataCPP(),boost::spirit::ascii::space_type>
{

    tick_parser() : tick_parser::base_type(start)
    {
        my_string %= lexeme[+(boost::spirit::ascii::char_ ) ];

        start %=
            my_string >> \',\'
            >>  my_string >> \',\'
            >>  float_ >> \',\'
            >>  int_ >> \',\'
            >>  int_
            ;
    }

    rule<Iterator,std::string(),boost::spirit::ascii::space_type> my_string;
    rule<Iterator,sTokenDataCPP(),boost::spirit::ascii::space_type> start;
};

将 ITERATIONS 设置为 500k 时：strtok 版本：2 秒；John 的版本：115 秒；Boost 版本：172 秒。如果大家需要，我可以贴出完整代码，只是不想占用太多篇幅。