详见:http://www2.sas.com/proceedings/sugi29/265-29.pdf
/*----------------------------------------------------------------------
  Perl regular expression (PRX) tutorial programs 1-12.
  Each program is an independent DATA step reading in-stream data
  (one record per line after DATALINES), usually followed by a
  PROC PRINT of the data set it creates.
----------------------------------------------------------------------*/

/*Program 1: Using a Perl regular expression to locate lines with an
  exact text match*/
/*Primary functions: PRXPARSE, PRXMATCH*/
DATA _NULL_;
   TITLE "Perl Regular Expression Tutorial – Program 1";
   /* PRXPARSE: compile the regular expression once, on the first
      iteration only, and keep the returned pattern id via RETAIN */
   IF _N_ = 1 THEN PATTERN_NUM = PRXPARSE("/cat/");
   RETAIN PATTERN_NUM;
   INPUT STRING $30.;
   /* PRXMATCH: position of the first match in STRING (0 = no match) */
   POSITION = PRXMATCH(PATTERN_NUM,STRING);
   FILE PRINT;
   PUT PATTERN_NUM= STRING= POSITION=;
DATALINES;
There is a cat in this line.
Does not match CAT
cat in the beginning
At the end,a cat
cat
;

/*Program 2: Using a regular expression to search for phone numbers
  in a string*/
/*Primary functions: PRXPARSE, PRXMATCH*/
DATA PHONE;
   IF _N_ = 1 THEN PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
   /* Regular expression will match any phone number in the form:
      (nnn)nnn-nnnn or (nnn) nnn-nnnn.

      \(        matches a left parenthesis
      \d\d\d    matches any three digits
      \)        matches a right parenthesis
      (blank)?  matches zero or one blank
      \d\d\d    matches any three digits
      -         matches a dash
      \d{4}     matches any four digits
   */
   RETAIN PATTERN;
   INPUT STRING $CHAR40.;
   /* Keep only the records that contain a phone number */
   IF PRXMATCH(PATTERN,STRING) GT 0 THEN OUTPUT;
DATALINES;
One number (123)333-4444
Two here:(800)234-2222 and (908) 444-2344
None here
;
PROC PRINT DATA=PHONE NOOBS;
   TITLE "Listing of Data Set Phone";
RUN;

/*Program 3: Modifying Program 2 to search for toll-free phone numbers*/
/*Primary functions: PRXPARSE, PRXMATCH
  Other function:    MISSING*/
DATA TOLL_FREE;
   /* DO group (note the semicolon after DO) so that both the compile
      and the error check run on the first iteration only */
   IF _N_ = 1 THEN DO;
      RE = PRXPARSE("/\(8(00|77|87)\) ?\d\d\d-\d{4}\b/");
      /* Regular expression looks for phone numbers of the form:
         (nnn)nnn-nnnn or (nnn) nnn-nnnn.
         In addition the first digit of the area code must be an 8 and
         the next two digits must be either a 00, 77, or 87. */
      IF MISSING(RE) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN RE;
   INPUT STRING $CHAR80.;
   POSITION = PRXMATCH(RE,STRING);
   IF POSITION GT 0 THEN OUTPUT;
DATALINES;
One number on this line (877)234-8765
No numbers here
One toll free,one not:(908)782-6354 and (800)876-3333 xxx
Two toll free:(800)282-3454 and (887) 858-1234
No toll free here (609)848-9999 and (908) 345-2222
;
PROC PRINT DATA=TOLL_FREE NOOBS;
   TITLE "Listing of Data Set TOLL_FREE";
RUN;

/*Program 4: Using PRXMATCH without PRXPARSE (entering the regular
  expression directly in the function)*/
/*Primary functions: PRXMATCH*/
DATA MATCH_IT;
   INPUT @1 STRING $20.;
   /* The pattern is passed as a literal instead of a PRXPARSE id */
   POSITION = PRXMATCH("/\d\d\d/",STRING);
DATALINES;
LINE 345 IS HERE
NONE HERE
ABC1234567
;
PROC PRINT DATA=MATCH_IT NOOBS;
   TITLE "Listing of Data Set MATCH_IT";
RUN;

/*Program 5: Locating all 5- or 9-digit zip codes in a list of
  addresses*/
/*Primary functions: PRXPARSE and PRXSUBSTR
  Other functions:   SUBSTRN*/
DATA ZIPCODE;
   IF _N_ = 1 THEN RE = PRXPARSE("/ \d{5}(-\d{4})?/");
   RETAIN RE;
   /* Match a blank followed by 5 digits followed by either nothing
      or a dash and 4 digits

      \d{5}  matches 5 digits
      -      matches a dash
      \d{4}  matches 4 digits
      ?      matches zero or one of the preceding subexpression
   */
   INPUT STRING $80.;
   LENGTH ZIP_CODE $ 10;
   CALL PRXSUBSTR(RE,STRING,START,LENGTH);
   IF START GT 0 THEN DO;
      /* START + 1 / LENGTH - 1 skip the leading blank that is part
         of the match */
      ZIP_CODE = SUBSTRN(STRING,START + 1,LENGTH - 1);
      OUTPUT;
   END;
   KEEP ZIP_CODE;
DATALINES;
John Smith
12 broad Street
Flemington,NJ 08822
Philip Judson
Apt #1,Building 7
777 Route 730
Kerrville,TX 78028
Dr. Roger Alan
44 Commonwealth Ave.
Boston,MA 02116-7364
;
PROC PRINT DATA=ZIPCODE NOOBS;
   TITLE "Listing of Data Set ZIPCODE";
RUN;

/*Program 6: Extracting a phone number from a text string*/
/*Primary functions: PRXPARSE, PRXSUBSTR
  Other functions:   SUBSTR, COMPRESS, and MISSING*/
DATA EXTRACT;
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
      IF MISSING(PATTERN) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN PATTERN;
   LENGTH NUMBER $ 15;
   INPUT STRING $CHAR80.;
   /* CALL PRXSUBSTR needs the source string and returns the start
      position and length of the first match */
   CALL PRXSUBSTR(PATTERN,STRING,START,LENGTH);
   IF START GT 0 THEN DO;
      NUMBER = SUBSTR(STRING,START,LENGTH);
      /* Remove the optional blank after the area code */
      NUMBER = COMPRESS(NUMBER," ");
      OUTPUT;
   END;
   KEEP NUMBER;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (123) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=EXTRACT NOOBS;
   TITLE "Extracted Phone Numbers";
RUN;

/*Program 7: Using the PRXPOSN function to extract the area code and
  exchange from a phone number*/
/*Primary functions: PRXPARSE, PRXMATCH, PRXPOSN
  Other functions:   SUBSTR*/
RUN;
DATA PIECES;
   /* Compile once on the first iteration (a bare "IF _N_" is always
      true and would recompile the pattern for every record) */
   IF _N_ = 1 THEN RE = PRXPARSE("/\((\d\d\d)\) ?(\d\d\d)-\d{4}/");
   /*
      \(      matches an open parenthesis
      \d\d\d  matches three digits (capture buffer 1: area code)
      \)      matches a closed parenthesis
      b?      matches zero or one blank (b = blank)
      \d\d\d  matches three digits (capture buffer 2: exchange)
      -       matches a dash
      \d{4}   matches four digits
   */
   RETAIN RE;
   INPUT NUMBER $CHAR80.;
   MATCH = PRXMATCH(RE,NUMBER);
   IF MATCH GT 0 THEN DO;
      /* Length is optional in CALL PRXPOSN; the area code is always
         three characters so only its start is requested */
      CALL PRXPOSN(RE,1,AREA_START);
      CALL PRXPOSN(RE,2,EX_START,EX_LENGTH);
      AREA_CODE = SUBSTR(NUMBER,AREA_START,3);
      EXCHANGE = SUBSTR(NUMBER,EX_START,EX_LENGTH);
   END;
   DROP RE;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (609) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=PIECES NOOBS heading=H;
   TITLE "Listing of Data Set PIECES";
RUN;

/*Program 8: Using regular expressions to read very unstructured data*/
/*Primary functions: PRXPARSE, PRXPOSN
  Other functions:   SUBSTR, INPUT*/
/*This program will read every line of data and, for any line that
  contains two or more numbers, will assign the first number to X and
  the second number to Y*/
DATA READ_NUM;
   /* Read the first number and second numbers on line */
   IF _N_ = 1 THEN RET = PRXPARSE("/(\d+) +\D*(\d+)/");
   /*
      \d+  matches one or more digits (capture buffer 1)
      b+   matches one or more blanks (b = blank)
      \D*  matches zero or more non-digits
      \d+  matches one or more digits (capture buffer 2)
   */
   RETAIN RET;
   INPUT STRING $CHAR40.;
   POS = PRXMATCH(RET,STRING);
   IF POS GT 0 THEN DO;
      /* Second argument of CALL PRXPOSN selects the capture buffer */
      CALL PRXPOSN(RET,1,START1,LENGTH1);
      IF START1 GT 0 THEN X = INPUT(SUBSTR(STRING,START1,LENGTH1),9.);
      CALL PRXPOSN(RET,2,START2,LENGTH2);
      IF START2 GT 0 THEN Y = INPUT(SUBSTR(STRING,START2,LENGTH2),9.);
      OUTPUT;
   END;
   KEEP STRING X Y;
DATALINES;
XXXXXXXXXXXXXXXXXX 9 XXXXXXX 123
This line has a 6 and a 123 in it
456 789
None on this line
Only one here: 77
;
PROC PRINT DATA=READ_NUM NOOBS;
   TITLE "Listing of Data Set READ_NUM";
RUN;

/*Program 9: Finding digits in random positions in an input string
  using CALL PRXNEXT*/
/*Primary functions: PRXPARSE, PRXNEXT*/
DATA FIND_NUM;
   /* Look for one or more digits in a row */
   IF _N_ = 1 THEN RET = PRXPARSE("/\d+/");
   RETAIN RET;
   INPUT STRING $40.;
   START = 1;
   STOP = LENGTH(STRING);
   /* CALL PRXNEXT(id, start, stop, source, position, length):
      START is advanced past each match so the next call continues
      the scan; POSITION is 0 when no further match exists */
   CALL PRXNEXT(RET,START,STOP,STRING,POSITION,LENGTH);
   ARRAY X[5];
   DO I = 1 TO 5 WHILE (POSITION GT 0);
      X[I] = INPUT(SUBSTR(STRING,POSITION,LENGTH),9.);
      CALL PRXNEXT(RET,START,STOP,STRING,POSITION,LENGTH);
   END;
   KEEP X1-X5 STRING;
DATALINES;
THIS 45 LINE 98 HAS 3 NUMBERS
NONE HERE
12 34 78 90
;
PROC PRINT DATA=FIND_NUM NOOBS;
   TITLE "Listing of Data Set FIND_NUM";
RUN;

/*Program 10: Demonstrating the PRXPAREN function*/
/*Primary functions: PRXPARSE, PRXPAREN*/
DATA PAREN;
   /* One or two or three digit number followed by a blank */
   IF _N_ = 1 THEN PATTERN = PRXPARSE("/(\d )|(\d\d )|(\d\d\d )/");
   RETAIN PATTERN;
   INPUT STRING $CHAR30.;
   POSITION = PRXMATCH(PATTERN,STRING);
   /* PRXPAREN is called only after a successful match */
   IF POSITION GT 0 THEN WHICH_PAREN = PRXPAREN(PATTERN);
DATALINES;
one single digit 8 here
two 888 77
12345 1234 123 12 1
;
PROC PRINT DATA=PAREN NOOBS;
   TITLE "Listing of Data Set PAREN";
RUN;

/*Program 11: Demonstrating the PRXCHANGE function*/
/*Primary functions: PRXPARSE, PRXCHANGE*/
DATA CAT_AND_MOUSE;
   INPUT TEXT $CHAR40.;
   LENGTH NEW_TEXT $ 80;
   /* Replace "Cat" or "cat" with Mouse */
   IF _N_ = 1 THEN MATCH = PRXPARSE("s/[Cc]at/Mouse/");
   RETAIN MATCH;
   /* -1 = replace every occurrence; result goes to NEW_TEXT */
   CALL PRXCHANGE(MATCH,-1,TEXT,NEW_TEXT,R_LENGTH,Trunc,N_OF_CHANGES);
   IF Trunc THEN PUT "Note: NEW_TEXT was truncated";
DATALINES;
The Cat in the hat
There are two cat cats in this line
;
PROC PRINT DATA=CAT_AND_MOUSE NOOBS;
   TITLE "Listing of CAT_AND_MOUSE";
RUN;

/*Program 12: Demonstrating the use of capture buffers with PRXCHANGE*/
/*Primary functions: PRXPARSE, PRXCHANGE*/
DATA CAPTURE;
   /* Swap the first two words: $1 and $2 refer to the two capture
      buffers. Lowercase "s" is the documented substitution operator,
      the same form Program 11 uses. */
   IF _N_ = 1 THEN RETURN = PRXPARSE("s/(\w+ +)(\w+)/$2 $1/");
   RETAIN RETURN;
   INPUT STRING $20.;
   /* The second argument (number of replacements) is required;
      -1 = replace every occurrence. With no separate result variable
      the change is made to STRING in place. */
   CALL PRXCHANGE(RETURN,-1,STRING);
DATALINES;
Ron Cody
Russell Lynn
;
PROC PRINT DATA=CAPTURE NOOBS;
   TITLE "Listing of Data Set CAPTURE";
RUN;
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。