代码如下:
1
/***/
/**Theregexforsearchlinkwiththetag"a"*/
2 private finalStringA_REGEX="<a.*?/a>";
3 /***/ /**Theregexforsearchurlwiththetag"href"*/
4 private finalStringHREF_REGEX="href=\".*?\"";
5 /***/ /**Thepatternforlinkewiththetag"a"*/
6 private finalPatternA_PATTERN=Pattern.compile(A_REGEX);
7 /***/ /**Thepatternforurlwiththetag"href"*/
8 private finalPatternHREF_PATTERN=Pattern.compile(HREF_REGEX);
9 /***/ /**
10*Geturladdressfromtheurlandthecontentoftheurl
11*@paramurltheurlneedtobegetlinks
12*@paramcontentthecontentofthegivenurl
13*@returnalistwiththeurladdressofthelinks
14*/
15 publicList<String>getLinkList(URLurl,Stringcontent)
16 {
17List<String>linkList=newLinkedList<String>();
18finalMatchera_matcher=A_PATTERN.matcher(content);
19while(a_matcher.find())
20{
21//JUSTFORTEST!
22//System.out.println(a_matcher.group());
23//geturladdress
24finalMatchermyurl=HREF_PATTERN.matcher(a_matcher.group());
25while(myurl.find())
26{
27StringurlAddress=myurl.group().replaceAll("href=|>|\"|\"","");
28if(urlAddress.startsWith("http"))
29{
30linkList.add(urlAddress);
31}
32elseif(urlAddress.startsWith("/")||urlAddress.startsWith("\\"))
33{
34linkList.add(url.getPath()+urlAddress);
35}
36else
37{
38StringfullUrl=url.toString();
39//thelengthoftheurlwithoutthecurrentpage
40intlastSlash=fullUrl.lastIndexOf("/")+1;
41linkList.add(fullUrl.substring(0,lastSlash)+urlAddress);
42}
43}
44}
45returnlinkList;
46}
2 private finalStringA_REGEX="<a.*?/a>";
3 /***/ /**Theregexforsearchurlwiththetag"href"*/
4 private finalStringHREF_REGEX="href=\".*?\"";
5 /***/ /**Thepatternforlinkewiththetag"a"*/
6 private finalPatternA_PATTERN=Pattern.compile(A_REGEX);
7 /***/ /**Thepatternforurlwiththetag"href"*/
8 private finalPatternHREF_PATTERN=Pattern.compile(HREF_REGEX);
9 /***/ /**
10*Geturladdressfromtheurlandthecontentoftheurl
11*@paramurltheurlneedtobegetlinks
12*@paramcontentthecontentofthegivenurl
13*@returnalistwiththeurladdressofthelinks
14*/
15 publicList<String>getLinkList(URLurl,Stringcontent)
16 {
17List<String>linkList=newLinkedList<String>();
18finalMatchera_matcher=A_PATTERN.matcher(content);
19while(a_matcher.find())
20{
21//JUSTFORTEST!
22//System.out.println(a_matcher.group());
23//geturladdress
24finalMatchermyurl=HREF_PATTERN.matcher(a_matcher.group());
25while(myurl.find())
26{
27StringurlAddress=myurl.group().replaceAll("href=|>|\"|\"","");
28if(urlAddress.startsWith("http"))
29{
30linkList.add(urlAddress);
31}
32elseif(urlAddress.startsWith("/")||urlAddress.startsWith("\\"))
33{
34linkList.add(url.getPath()+urlAddress);
35}
36else
37{
38StringfullUrl=url.toString();
39//thelengthoftheurlwithoutthecurrentpage
40intlastSlash=fullUrl.lastIndexOf("/")+1;
41linkList.add(fullUrl.substring(0,lastSlash)+urlAddress);
42}
43}
44}
45returnlinkList;
46}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。