微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

一个完备的微型正则表达式【源码实现】

说明:刚才发现在处理*元字符时弄错了,代码修改重新上传到CSDN了,文章中的示例代码也进行了修改

前一版本有错误代码中将*处理成了前一字符至少出现1次,修改后为出现0次或多次。

如果你是通过CSDN下载找到这个页面的,请务必留意,你下载的可能不是最终版的代码。最终版代码下载地址:

http://download.csdn.net/detail/sun2043430/5333836


看了《代码之美》第一章的《正则表达式》之后一直手痒,想自己写一个。趁着周末有空试验了一把。


首先是挑选正则表达式的元字符集,我选择了以下这些元字符:

// c 匹配任意的字母(大小写均可)
// . 任意字符
// * 重复前一字符0次或多次,但尽量多次(贪婪匹配)
// ^ 匹配串的开头
// $ 匹配串的结尾
// \ 转义字符(\c表示一个字母'c',\\表示一个字母'\')

在《代码之美》里面缺少转义字符,但是在一个完备的正则表达式中,转义字符是必不可少的(否则会有不能表示的字符),所以我在代码实现中加入了转义字符。

另外《代码之美》一书中的代码对*元字符采用的是简单的懒惰匹配,而在正规的正则表达式中,*、+、?都是采用的贪婪匹配的。从代码实现上来说贪婪匹配稍难于懒惰匹配。


另外,《代码之美》一书中也提到了如果涉及到转义字符、中括号表示的字符集时,最好用结构体的方式来表示正则表达式中的一个字符。


我的源代码中在进行正则表达式匹配之前,先对正则表达式的正确性做验证,看正则表达式是否正确。罗列的标准如下:

  1. 元字符*不能在开头,元字符^只能出现在表达式开头,元字符$只能出现在表达式末尾。
  2. 转义字符\后面只能跟元字符
  3. 元字符^后面不能是元字符*
  4. 元字符*后面不能是元字符*
一些正则表达式是否正确的测试样例:
"^^Now err"     is error regular expression!
"^*abcd"        is error regular expression!
"ababcd\"       is error regular expression!
"^^*bcd"        is error regular expression!
"a^*bcd"        is error regular expression!
"^ab$cd"        is error regular expression!
"a**abcd"       is error regular expression!
"*abcd"         is error regular expression!
"\a*bcd"        is error regular expression!
"Now ok"        is correct regular expression!
"\\a*bcd"       is correct regular expression!
"\**bcd"        is correct regular expression!
".*abcd"        is correct regular expression!
"^a*bcd"        is correct regular expression!
"^abcd$"        is correct regular expression!
".abc d"        is correct regular expression!
".*abcd"        is correct regular expression!
"\c*abcd"       is correct regular expression!
"\*abcd"        is correct regular expression!
"c*abcd"        is correct regular expression!
"\.c*abcd"      is correct regular expression!
"abc\.c*abcd"   is correct regular expression!
"abc  d"        is correct regular expression!
"\^*abcd"       is correct regular expression!

检测正则表达式是否正确的代码

bool CheckRegExpr(const char *pReg)
{
    const char *pBegin = pReg;
    if ('*' == *pBegin)
        return false;

    while (*pReg)
    {
        if ( ('^' == *pReg && pReg != pBegin) || 
            ('^' == *pReg && pReg[1] == '*') || 
            ('$' == *pReg && pReg[1] != '\0') )
            return false;

        if ('*' == *pReg && '*' == pReg[1])
            return false;

        if ('\\' == *pReg)
        {
            if (!IsRegMetacharacter(pReg[1]))
                return false;
            else
                pReg++;
        }

        pReg++;
    }
    return true;
}

正则匹配的核心代码

const char* RegExprFind(const char *pText,const char *pReg)
{
    const char *pCur = pText;
    if (!CheckRegExpr(pReg))
        return (char*)-1;

    do 
    {
        g_pBeg = pCur;
        MATCH_STATE eResult = Match(pCur,pReg);
        if (MATCH_OK == eResult)
            return pCur;
        else if (MATCH_ERROR == eResult)
            return (char*)-1;
        else if (MATCH_FAIL == eResult)
            return NULL;
    }
    while (*pCur++);
    return NULL;
}

MATCH_STATE Match(const char *pCur,const char *pReg)
{
    g_pEnd = pCur;
    if ('\0' == *pReg)
        return (g_pEnd != g_pBeg) ? MATCH_OK : MATCH_NEXT_POSITION;

    if ('$' == *pReg)
        return ('\0' == *pCur && g_pEnd != g_pBeg) ? MATCH_OK : MATCH_FAIL;

    if ('^' == *pReg)
    {
//        return Match(pCur,pReg+1);
        if (MATCH_OK == Match(pCur,pReg+1))
            return MATCH_OK;
        else
            return MATCH_FAIL;
    }

    st_RE_Element elementCur;// 更复杂的情况要借助结构体
    st_RE_Element elementNext;
    int nElementLen1 = 0;
    int nElementLen2 = 0;
    nElementLen1 = GetReElement(&elementCur,pReg);
    nElementLen2 = GetReElement(&elementNext,pReg+nElementLen1);

    if (!elementNext.isEscape && elementNext.ch == '*')// 贪婪匹配比较麻烦
    {
        const char *pStart = pCur;
        while(*pCur && MatchAt(elementCur,*pCur))
            pCur++;
        while (pCur >= pStart)
        {
            if (MATCH_OK == Match(pCur,pReg+nElementLen1+nElementLen2))
                return MATCH_OK;
            pCur--;
        }
    }
    else
    {
        if (MatchAt(elementCur,*pCur))
            return Match(pCur+1,pReg+nElementLen1);
    }

    return MATCH_NEXT_POSITION;
}

int GetReElement(pst_RE_Element pElement,const char *pReg)
{
    if (*pReg == '\\')
    {
        pElement->isEscape = true;
        pElement->ch = pReg[1];
        return 2;
    }
    else
    {
        pElement->isEscape = false;
        pElement->ch = *pReg;
        return 1;
    }
}

bool MatchAt(st_RE_Element regChar,char ch)
{
    if (regChar.isEscape) // \c \\ etc.
    {
        return ch == regChar.ch;
    }
    else // a . c
    {
        if ('.' == regChar.ch || ('c' == regChar.ch && IsAlpha(ch)) )
        {
            return true;
        }
        return ch == regChar.ch;
    }
}


正则表达式对应的结构体和枚举常量:

typedef struct _st_RE_Element
{
    char ch;
    bool isEscape;
}st_RE_Element,*pst_RE_Element;

enum MATCH_STATE
{
    MATCH_OK,MATCH_NEXT_POSITION,MATCH_FAIL,MATCH_ERROR,//regular expression Syntax error,like "a**b"
};

正则表达式匹配效果演示:

                                        0         1         2         3         4         5
                                        012345678901234567890123456789012345678901234567890
Text is                                 Let's ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 00      RegExpr  =  "Let"       Let
Find at 00      RegExpr  =  "^Let.*$"   Let's ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 00      RegExpr  =  "c*"        Let
Find at 06      RegExpr  =  "g*o*"      ggggo
Find at 06      RegExpr  =  "g*a*"      gggg
Find at 30      RegExpr  =  "k*2*3*k*"  22333333
Find at 30      RegExpr  =  "k*2*k*3*"  22333333
Find at 22      RegExpr  =  "\$"        $
Find at 20      RegExpr  =  "\."        .
Find at 24      RegExpr  =  "\\*"       \\\
Find at 20      RegExpr  =  "\..*23"    .^$*\\\fg 2233333321222223
Find at 00      RegExpr  =  "c*"        Let
Find at 14      RegExpr  =  "\c*"       c
Find at 49      RegExpr  =  "2*$"       22222
Find at 32      RegExpr  =  "3*"        333333
Find at 30      RegExpr  =  "2*223"     223
Find at 30      RegExpr  =  "2*23"      223
Find at 06      RegExpr  =  "g*o.*3*21" ggggo abcdeccc.^$*\\\fg 2233333321
Find at 06      RegExpr  =  "g*o.*3*2"  ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 06      RegExpr  =  "g*o.*3*"   ggggo abcdeccc.^$*\\\fg 223333332122222333322222
Find at 06      RegExpr  =  "g*o.*3"    ggggo abcdeccc.^$*\\\fg 2233333321222223333
Not Found!      RegExpr  =  "Below are Not Found!------------------------------------------"
Not Found!      RegExpr  =  "k*"
Not Found!      RegExpr  =  "g*o.2*3"
Find at 54      RegExpr  =  "3*$"
Not Found!      RegExpr  =  "^3*"
Not Found!      RegExpr  =  "Below are Syntax Error!---------------------------------------"
Syntax error!   RegExpr  =  "^*abc"
Syntax error!   RegExpr  =  ".*a$bc"

完整代码可到以下地址下载:

http://download.csdn.net/detail/sun2043430/5333836

注:代码中使用函数式实现,为了便于打印输出、展示结果,使用了全局变量

原文地址:https://www.jb51.cc/regex/362949.html

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐