springBoot+webMagic实现网站爬虫的实例代码
前段时间公司项目需要抓取各类数据,Python 玩得不熟,只好研究 Java 爬虫方案,在此做一个总结。
开发环境:
springBoot2.2.6、jdk1.8。
1、导入依赖
Maven 依赖坐标:us.codecraft:webmagic-core:0.7.3、us.codecraft:webmagic-extension:0.7.3、com.google.guava:guava:16.0
话不多说,直接上代码。
基础案例
下面代码说明以一个类似列表的页面为例
package com.crawler.project.proTask;

import com.alibaba.fastjson.JSONObject;
import org.springframework.scheduling.annotation.Scheduled;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Crawler for a list-style site: list pages enqueue the link of every list
 * element (plus the next list page), detail pages are parsed into a JSON
 * object and handed to the pipeline via {@code page.putField}.
 */
public class TaskProcessor implements PageProcessor {

    /**
     * Crawl logic: decide whether the fetched page is a list page or a
     * detail page and act accordingly.
     */
    @Override
    public void process(Page page) {
        // 1. Parse the element list on the page (replace "cssselector" with the real selector).
        List<Selectable> list = page.getHtml().css("cssselector").nodes();
        if (!list.isEmpty()) {
            // List page: enqueue the link of every element for later fetching.
            for (Selectable selectable : list) {
                page.addTargetRequest(selectable.links().toString());
            }
            // Also enqueue the next list page.
            page.addTargetRequest("下一页的url");
        } else {
            // Detail page of a single list element: extract the needed data.
            handle(page);
        }
    }

    /** Processes a detail page and publishes the extracted result. */
    private void handle(Page page) {
        // Example: the processed result is a JSONObject.
        JSONObject tmp = new JSONObject();
        // Hand tmp to the custom TaskPipeline; without a custom pipeline set on
        // the Spider, the framework simply prints the field to the console.
        page.putField("obj", tmp);
    }

    /*
     * Crawler configuration: charset, timeout and retry policy.
     */
    private Site site = Site.me()
            .setCharset("UTF-8")
            .setTimeOut(60 * 1000)
            .setRetrySleepTime(60 * 1000)
            .setCycleRetryTimes(5);

    @Override
    public Site getSite() {
        return site;
    }

    /*
     * Scheduled entry point that launches the crawl.
     */
    @Scheduled(initialDelay = 1 * 1000, fixedDelay = 2 * 1000)
    public void process() {
        System.out.println("开始执行爬虫抓取任务");
        Spider.create(new TaskProcessor()) // class name here must match this class
                .addUrl("起始页url")
                .addPipeline(new TaskPipeline()) // custom result-handling pipeline (see handle())
                .setScheduler(new QueueScheduler()
                        .setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(3) // thread count: keep modest, ideally about the list size per page
                .run();
    }
}
packagecom.crawler.project.proTask; importcom.alibaba.fastjson.JSON; importcom.alibaba.fastjson.JSONObject; importus.codecraft.webmagic.ResultItems; importus.codecraft.webmagic.Task; importus.codecraft.webmagic.pipeline.Pipeline; publicclassTaskPipelineimplementsPipeline{ @Override publicvoidprocess(ResultItemsresultItems,Tasktask){ if(resultItems.getAll().size()>0){ Objectobj=resultItems.getAll().get("obj"); JSONObjectjsonObject=JSON.parseObject(obj.toString()); //获取到JSONObject对象下面可进行自定义的业务处理。 } } }
特殊情况一
需根据链接下载图片或文件
eg:在上面说到的详情页中含有iframe。
1、首先获取iframe的src
//获得iframe的src(这里要注意获得的src是绝对路径还是相对路径,相对路径需要拼接主站点url) Stringsrc=html.css("cssselector","src").toString(); //采用jsoup解析 Documentdocument=Jsoup.parse(newURL(src),1000); //获得需要的元素 Elementele=document.select("cssselector").last(); //获取需要下载的文件的链接 StringdownUrl=ele.attr("href"); //根据链接下载文件返回一个文件的名称 StringfileName=downloadFile(downUrl);
//通过url下载文件 publicStringdownloadFile(StringfileUrl)throwsFileNotFoundException{ try{ URLhttpUrl=newURL(fileUrl); StringfileName=UUID.randomUUID().toString()+".mp3"; Filefile=newFile(this.STATIC_FILEPATH+fileName); System.out.println("============保存文件方法被调用==============="); FileUtils.copyURLToFile(httpUrl,file); returnfileName; }catch(Exceptione){ e.printStackTrace(); returnnull; } }
特殊情况二
有些https站点无法直接使用WebMagic默认的下载器下载,此时我们可以根据站点ssl类型修改下载器。
在项目中创建一个包用于存放自定义(修改)的下载器类
(!!!摘自webMagic框架中HttpClientDownloader,基于此类修改!!!)
/*
 * Custom downloader (adapted from WebMagic's HttpClientDownloader). It must be
 * wired with the custom HttpClientGenerator from this package so that https
 * sites with non-default ssl settings can be fetched.
 */
package com.crawler.project.spider_download;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.HttpClientRequestContext;
import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;

/**
 * The http downloader based on HttpClient.
 *
 * @author code4crafter@gmail.com
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    // One cached client per site domain.
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    // NOTE: this must be the custom HttpClientGenerator from this package,
    // not the HttpClientGenerator shipped with the WebMagic dependency.
    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    private ProxyProvider proxyProvider;

    private boolean responseHeader = true;

    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /** Returns the cached client for the site's domain, creating it lazily. */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        CloseableHttpClient httpClient = httpClients.get(domain);
        if (httpClient == null) {
            // Double-checked locking around the per-domain client cache.
            synchronized (this) {
                httpClient = httpClients.get(domain);
                if (httpClient == null) {
                    httpClient = httpClientGenerator.getClient(site);
                    httpClients.put(domain, httpClient);
                }
            }
        }
        return httpClient;
    }

    @Override
    public Page download(Request request, Task task) {
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        Page page = Page.fail();
        try {
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            // Request charset wins over the site-wide charset.
            page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
            onSuccess(request);
            logger.info("downloading page success {}", request.getUrl());
            return page;
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return page;
        } finally {
            if (httpResponse != null) {
                // ensure the connection is released back to pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * Converts the raw http response into a Page, decoding text content with
     * the given (or auto-detected) charset.
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
        String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
        Page page = new Page();
        page.setBytes(bytes);
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        page.setDownloadSuccess(true);
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    /** Detects the page charset, falling back to the platform default. */
    private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Charset.defaultCharset().name();
            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
        }
        return charset;
    }
}
然后在自定义的HttpClientGenerator类中修改有关ssl的参数
(!!!摘自webMagic框架中HttpClientGenerator,基于此类修改!!!)
/*
 * Custom HttpClientGenerator (adapted from WebMagic's HttpClientGenerator);
 * the ssl parameters are adjusted in createIgnoreVerifySSL().
 */
package com.sealion_crawler.project.spider_download;

import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.downloader.CustomRedirectStrategy;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

/**
 * @author code4crafter@gmail.com
 * @since 0.4.0
 */
public class HttpClientGenerator {

    private transient Logger logger = LoggerFactory.getLogger(getClass());

    private PoolingHttpClientConnectionManager connectionManager;

    public HttpClientGenerator() {
        Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", buildSSLConnectionSocketFactory())
                .build();
        connectionManager = new PoolingHttpClientConnectionManager(reg);
        connectionManager.setDefaultMaxPerRoute(100);
    }

    /*
     * ssl parameters are configured here.
     */
    private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
        try {
            return new SSLConnectionSocketFactory(createIgnoreVerifySSL(),
                    new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
                    null,
                    new DefaultHostnameVerifier()); // bypass certificate verification
        } catch (KeyManagementException e) {
            logger.error("ssl connection fail", e);
        } catch (NoSuchAlgorithmException e) {
            logger.error("ssl connection fail", e);
        }
        return SSLConnectionSocketFactory.getSocketFactory();
    }

    private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
        // X509TrustManager implementation that trusts everything; used only to
        // skip certificate checks — the method bodies are intentionally empty.
        X509TrustManager trustManager = new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return null;
            }
        };
        /*
         * The framework default is:
         *   SSLContext sc = SSLContext.getInstance("SSLv3");
         * change the protocol here to whatever the target site requires.
         */
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, new TrustManager[]{trustManager}, null);
        return sc;
    }

    public HttpClientGenerator setPoolSize(int poolSize) {
        connectionManager.setMaxTotal(poolSize);
        return this;
    }

    public CloseableHttpClient getClient(Site site) {
        return generateClient(site);
    }

    /** Builds a CloseableHttpClient configured from the Site's settings. */
    private CloseableHttpClient generateClient(Site site) {
        HttpClientBuilder httpClientBuilder = HttpClients.custom();
        httpClientBuilder.setConnectionManager(connectionManager);
        if (site.getUserAgent() != null) {
            httpClientBuilder.setUserAgent(site.getUserAgent());
        } else {
            httpClientBuilder.setUserAgent("");
        }
        if (site.isUseGzip()) {
            httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
                public void process(
                        final HttpRequest request,
                        final HttpContext context) throws HttpException, IOException {
                    if (!request.containsHeader("Accept-Encoding")) {
                        request.addHeader("Accept-Encoding", "gzip");
                    }
                }
            });
        }
        // fix the post/redirect/post 302 redirect problem
        httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());
        SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
        socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
        socketConfigBuilder.setSoTimeout(site.getTimeOut());
        SocketConfig socketConfig = socketConfigBuilder.build();
        httpClientBuilder.setDefaultSocketConfig(socketConfig);
        connectionManager.setDefaultSocketConfig(socketConfig);
        httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
        generateCookie(httpClientBuilder, site);
        return httpClientBuilder.build();
    }

    /** Installs the Site's cookies (global and per-domain) into the builder. */
    private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
        if (site.isDisableCookieManagement()) {
            httpClientBuilder.disableCookieManagement();
            return;
        }
        CookieStore cookieStore = new BasicCookieStore();
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
            BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
            cookie.setDomain(site.getDomain());
            cookieStore.addCookie(cookie);
        }
        for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
            for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
                BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie.setDomain(domainEntry.getKey());
                cookieStore.addCookie(cookie);
            }
        }
        httpClientBuilder.setDefaultCookieStore(cookieStore);
    }
}
好了,基于 WebMagic 框架实现爬虫(包括 jsoup 的使用)的总结就到这里了。
到此这篇关于springBoot+webMagic实现网站爬虫的实例代码的文章就介绍到这了,更多相关springBootwebMagic爬虫内容请搜索毛票票以前的文章或继续浏览下面的相关文章希望大家以后多多支持毛票票!
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:czq8825#qq.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。