Java用正则表达式如何读取网页内容
学习 Java 的正则表达式，抓取网页并解析 HTML 中的部分内容
packagecom.xiaofeng.picup; importjava.io.BufferedReader; importjava.io.IOException; importjava.io.InputStreamReader; importjava.net.MalformedURLException; importjava.net.URL; importjava.util.ArrayList; importjava.util.HashMap; importjava.util.List; importjava.util.regex.Matcher; importjava.util.regex.Pattern; /***//** * *@抓取页面文章标题及内容(测试)手动输入网址抓取,可进一步自动抓取整个页面的全部内容 * */ publicclassWebContent...{ /***//** *读取一个网页全部内容 */ publicStringgetOneHtml(Stringhtmlurl)throwsIOException...{ URLurl; Stringtemp; StringBuffersb=newStringBuffer(); try...{ url=newURL(htmlurl); BufferedReaderin=newBufferedReader(newInputStreamReader(url .openStream(),"utf-8"));//读取网页全部内容 while((temp=in.readLine())!=null)...{ sb.append(temp); } in.close(); }catch(MalformedURLExceptionme)...{ System.out.println("你输入的URL格式有问题!请仔细输入"); me.getMessage(); throwme; }catch(IOExceptione)...{ e.printStackTrace(); throwe; } returnsb.toString(); } /***//** * *@params *@return获得网页标题 */ publicStringgetTitle(Strings)...{ Stringregex; Stringtitle=""; List<String>list=newArrayList<String>(); regex="<title>.*?</title>"; Patternpa=Pattern.compile(regex,Pattern.CANON_EQ); Matcherma=pa.matcher(s); while(ma.find())...{ list.add(ma.group()); } for(inti=0;i<list.size();i++)...{ title=title+list.get(i); } returnoutTag(title); } /***//** * *@params *@return获得链接 */ publicList<String>getLink(Strings)...{ Stringregex; List<String>list=newArrayList<String>(); regex="<a[^>]*href=("([^"]*)"|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>"; Patternpa=Pattern.compile(regex,Pattern.DOTALL); Matcherma=pa.matcher(s); while(ma.find())...{ list.add(ma.group()); } returnlist; } /***//** * *@params *@return获得脚本代码 */ publicList<String>getScript(Strings)...{ Stringregex; List<String>list=newArrayList<String>(); regex="<script.*?</script>"; Patternpa=Pattern.compile(regex,Pattern.DOTALL); Matcherma=pa.matcher(s); while(ma.find())...{ list.add(ma.group()); } returnlist; } /***//** * *@params *@return获得CSS */ publicList<String>getCSS(Strings)...{ Stringregex; 
List<String>list=newArrayList<String>(); regex="<style.*?</style>"; Patternpa=Pattern.compile(regex,Pattern.DOTALL); Matcherma=pa.matcher(s); while(ma.find())...{ list.add(ma.group()); } returnlist; } /***//** * *@params *@return去掉标记 */ publicStringoutTag(Strings)...{ returns.replaceAll("<.*?>",""); }