Java source code examples: cn.edu.hfut.dmic.webcollector.model.Page
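All snippets below use the Page object that WebCollector passes to a visitor for each fetched page. For orientation, here is a minimal, hypothetical crawler skeleton showing where visit(Page, CrawlDatums) fits; the class name, seed URL, and crawl path are placeholders, and the BreadthCrawler package path (plugin.berkeley here) differs between WebCollector versions:

import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

public class MyCrawler extends BreadthCrawler {

    public MyCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        addSeed("https://example.com/"); // placeholder seed
    }

    @Override
    public void visit(Page page, CrawlDatums next) {
        // called once per fetched page; follow-up tasks go into "next"
        System.out.println("Visited: " + page.url());
    }

    public static void main(String[] args) throws Exception {
        MyCrawler crawler = new MyCrawler("crawl", true);
        crawler.start(2); // crawl two levels deep
    }
}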
Example 1

@Override
public void visit(Page page, CrawlDatums next) {
    try {
        Thread.sleep(sleepTime != null ? sleepTime.longValue() : 5000L);
    } catch (InterruptedException e) {
        log.info("Failed to sleep", e);
    }
    log.info("Visit {}", page.url());
    if (page.matchType(WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)) {
        parseSogouSearchResult(page, next);
    } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)) {
        parseWxArticleList(page, next);
    } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_DETAIL)) {
        parseWxArticleDetail(page, next);
    }
}
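The seed setup for this WeChat crawler is not shown; a hypothetical sketch (the crawler variable, accountName, and the Sogou search URL shape are assumptions, while the constants come from the examples below) would type the seed as ACCOUNT_SEARCH and attach the metadata the parse methods read back:

String searchUrl = "https://weixin.sogou.com/weixin?type=1&query=" + accountName; // assumed URL shape
crawler.addSeed(new CrawlDatum(searchUrl, WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)
        .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, accountName)
        .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0));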
Example 2

@Override
public Page getResponse(CrawlDatum crawlDatum) throws Exception {
    Page page = super.getResponse(crawlDatum);
    Response response = page.obj();
    if (response != null) {
        if (currentProxy != null && Proxy.Type.DIRECT.equals(currentProxy.type())) {
            log.info("##### NOW it is DIRECT proxy #####");
            log.info("##### response: {}", response);
        }
        if (response.isRedirect()) {
            log.info("Resource is redirect, code: {}, location: {}", response.code(), response.header("location"));
            removeBadProxy(currentProxy);
        } else {
            List<Cookie> cookies = Cookie.parseAll(response.request().url(), response.headers());
            if (CollectionUtils.isNotEmpty(cookies)) {
                client.cookieJar().saveFromResponse(response.request().url(), cookies);
                addGoodProxy(currentProxy);
            }
        }
    }
    return page;
}
Example 3

@Override
public void visit(Page page) {
    if (!Pattern.matches(newsUrlRegex, page.getUrl())) {
        return;
    }
    Elements contentDivs = page.getDoc().select(contentCSSSelector);
    if (contentDivs.isEmpty()) {
        return;
    }
    String content = contentDivs.first().text().trim();
    if (content.isEmpty()) {
        return;
    }
    File file = new File(dir, id.getAndIncrement() + "");
    // try-with-resources closes the stream even if write() fails
    try (FileOutputStream fos = new FileOutputStream(file)) {
        fos.write(content.getBytes(StandardCharsets.UTF_8));
    } catch (Exception ex) {
        LogUtils.getLogger().info("Exception", ex);
    }
}
Example 4

@Override
public void visit(Page page, CrawlDatums next) {
    // If the http status code is 301 or 302,
    // you have to obtain the redirected url from the "Location" header of the http response
    // and add it to subsequent tasks via next.add(redirectedUrl).
    // Since the page may contain metadata,
    // copy it to the added task with xxxx.meta(page.copyMeta())
    if (page.code() == 301 || page.code() == 302) {
        try {
            // page.location() may be a relative url path,
            // so we have to construct an absolute url
            String redirectUrl = new URL(new URL(page.url()), page.location()).toExternalForm();
            next.addAndReturn(redirectUrl).meta(page.copyMeta());
        } catch (MalformedURLException e) {
            // the way to handle exceptions in WebCollector
            ExceptionUtils.fail(e);
        }
        return;
    }
    System.out.println("this page is not redirected: " + page.url());
}
Example 5

@Override
public void visit(Page page, CrawlDatums next) {
    String type = page.meta("type");
    // For a tag list page, extract the links to book list pages and add them as subsequent tasks
    if ("taglist".equals(type)) {
        // Every extracted link points to a book list page,
        // so attach the metadata type=booklist to these links
        next.addAndReturn(page.links("table.tagCol td>a")).meta("type", "booklist");
    } else if ("booklist".equals(type)) {
        next.addAndReturn(page.links("div.info>h2>a")).meta("type", "content");
    } else if ("content".equals(type)) {
        // For a content page, extract the book title and Douban rating
        String title = page.select("h1>span").first().text();
        String score = page.select("strong.ll.rating_num").first().text();
        System.out.println("title:" + title + "\tscore:" + score);
    }
}
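Because the first fetched pages run through the same type dispatch, the seeds must carry a type as well. A minimal sketch of the assumed seed setup (the crawler variable is a placeholder; the Douban tag URL is illustrative):

crawler.addSeed(new CrawlDatum("https://book.douban.com/tag/").meta("type", "taglist"));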
Example 6

@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.url();
    if (page.matchType("list")) {
        /* if type is "list", detect content pages by css selector and mark their type as "content" */
        next.add(page.links("h1.lh-condensed>a")).type("content");
    } else if (page.matchType("content")) {
        /* if type is "content", extract title and content of the news by css selector */
        String title = page.select("h1[class=lh-condensed]").first().text();
        String content = page.selectText("div.content.markdown-body");
        // read title_prefix and content_length_limit from the configuration
        title = getConf().getString("title_prefix") + title;
        // guard against pages shorter than the configured limit
        content = content.substring(0, Math.min(content.length(), getConf().getInteger("content_length_limit")));
        System.out.println("URL:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("content:\n" + content);
    }
}
Example 7

@MatchType(types = "searchEngine")
public void visitSearchEngine(Page page, CrawlDatums next) {
    String keyword = page.meta("keyword");
    int pageIndex = page.metaAsInt("pageIndex");
    System.out.println("Fetched page " + pageIndex + " of search results for keyword " + keyword);
    Elements results = page.select("li.b_algo>h2>a");
    for (int rank = 0; rank < results.size(); rank++) {
        Element result = results.get(rank);
        /*
        We also want to crawl the page each search result points to (an "outlink").
        When visiting an outlink we still want to know which results page and
        which position it came from, so the page number and rank are stored in
        the follow-up CrawlDatum. To distinguish outlinks from search engine
        result pages, the type is set to "outlink"; this value is entirely
        user-defined and could be any string.
        */
        String href = result.attr("abs:href");
        next.addAndReturn(href)
                .type("outlink")
                .meta("keyword", keyword)
                .meta("pageIndex", pageIndex)
                .meta("rank", rank);
    }
}
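The matching seed setup is not part of the source; a hypothetical version (the URL pattern and the keyword and crawler variables are assumptions, though the li.b_algo selector above suggests Bing) would create one typed seed per results page:

for (int pageIndex = 1; pageIndex <= 3; pageIndex++) {
    String url = "https://www.bing.com/search?q=" + keyword + "&first=" + ((pageIndex - 1) * 10 + 1); // assumed URL pattern
    crawler.addSeed(new CrawlDatum(url)
            .type("searchEngine")
            .meta("keyword", keyword)
            .meta("pageIndex", pageIndex));
}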
Example 8

@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchType("taglist")) {
        // For a tag list page, extract the links to book list pages,
        // set their type to "booklist", and add them as subsequent tasks
        next.add(page.links("table.tagCol td>a"), "booklist");
    } else if (page.matchType("booklist")) {
        next.add(page.links("div.info>h2>a"), "content");
    } else if (page.matchType("content")) {
        // For a content page, extract the book title and Douban rating
        String title = page.select("h1>span").first().text();
        String score = page.select("strong.ll.rating_num").first().text();
        System.out.println("title:" + title + "\tscore:" + score);
    }
}
Example 9

@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.url();
    /* if the page is a news page */
    if (page.matchUrl("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")) {
        /* extract title and content of the news by css selector */
        String title = page.select("h1[class=lh-condensed]").first().text();
        String content = page.selectText("div.content.markdown-body");
        System.out.println("URL:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("content:\n" + content);
        /* If you want to crawl more urls, add them to next */
        /* WebCollector automatically filters links that have been fetched before */
        /* If autoParse is true and a link added to next does not match the
           regex rules, that link will also be filtered. */
        // next.add("http://xxxxxx.com");
    }
}
Example 10

@MatchUrl(urlRegex = "https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")
public void visitNews(Page page, CrawlDatums next) {
    /* extract title and content of the news by css selector */
    String title = page.select("h1[class=lh-condensed]").first().text();
    String content = page.selectText("div.content.markdown-body");
    System.out.println("URL:\n" + page.url());
    System.out.println("title:\n" + title);
    System.out.println("content:\n" + content);
    /* If you want to crawl more urls, add them to next */
    /* WebCollector automatically filters links that have been fetched before */
    /* If autoParse is true and a link added to next does not match the
       regex rules, that link will also be filtered. */
    // next.add("http://xxxxxx.com");
}
Example 11

static String buildMessage(Visitor visitor, Method method) {
    String fullMethodName = ReflectionUtils.getFullMethodName(method);
    String validMethodFormat = String.format("public void %s(%s param0, %s param1){...}",
            method.getName(),
            Page.class.getName(),
            CrawlDatums.class.getName()
    );
    StringBuilder sb = new StringBuilder("\n\tThe definition of ")
            .append(fullMethodName)
            .append(" is invalid,\n")
            .append("\texpect \"").append(validMethodFormat).append("\",")
            .append("\n\tbut found \"")
            .append(ReflectionUtils.getMethodDeclaration(method)).append("{...}")
            .append("\"");
    return sb.toString();
}
Example 12

/**
 * Parses a Sogou search result page for WeChat official accounts.
 * @param page the fetched search result page
 * @param next container for follow-up crawl tasks
 */
protected void parseSogouSearchResult(Page page, CrawlDatums next) {
    String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
    int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);
    // Check how many retries with different proxies have been made
    if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
        log.info("Tried so many times using different proxy but all failed" +
                ", skip, accountName:{}", accountName);
        return;
    }
    log.info("Parsing sogou search result page, accountName: {}", accountName);
    Element accountLinkEle = page.select("p.tit>a").first();
    if (accountLinkEle == null) {
        processBlocked(page, next);
        return;
    }
    // Guard against a wrong account name
    String detectedAccount = accountLinkEle.text().trim();
    if (!accountName.equals(detectedAccount)) {
        log.info("accountName \"{}\" does not match \"{}\"", accountName, detectedAccount);
        return;
    }
    // Extract the account URL from the search result page
    String accountUrl = accountLinkEle.attr("abs:href");
    Element wxAccountEl = page.select("p.info>label[name='em_weixinhao']").first();
    if (wxAccountEl == null || StringUtils.isEmpty(wxAccountEl.text())) {
        log.info("accountId \"{}\" does not exist", accountName);
        return;
    }
    if (accountUrl.startsWith(WxCrawlerConstant.HTTP_PROTOCOL)) {
        accountUrl = accountUrl.replaceFirst(WxCrawlerConstant.HTTP_PROTOCOL, WxCrawlerConstant.HTTPS_PROTOCOL);
    }
    next.add(new CrawlDatum(accountUrl, WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)
            .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, accountName)
            .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID, wxAccountEl.text())
            .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0));
}
Example 13

private List<ArticleSummaryObj> parseArticleListByPage(Page page) throws Exception {
    String html = page.html();
    int startIndex = html.indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY) +
            WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY.length();
    int endIndex = html.indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_SUFFIX);
    String jsonStr = html.substring(startIndex, endIndex).trim();
    // strip the final character so the remainder parses as JSON
    jsonStr = jsonStr.substring(0, jsonStr.length() - 1);
    JSONObject json = JSONObject.parseObject(jsonStr);
    return JSONArray.parseArray(json.getString("list"), ArticleSummaryObj.class);
}
Example 14

/**
 * Parses the article list on a WeChat official account's home page.
 * @param page the fetched article list page
 * @param next container for follow-up crawl tasks
 */
protected void parseWxArticleList(Page page, CrawlDatums next) {
    String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
    log.info("Parsing weixin article list page, accountName:{}", accountName);
    String accountId = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID);
    int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);
    // Step 1: check how many retries with different proxies have been made
    if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
        log.info("Tried so many times using different proxy but all failed" +
                ", skip, accountName:{}", accountName);
        return;
    }
    // Step 2: fetch the article list
    List<ArticleSummaryObj> articles = null;
    try {
        articles = parseArticleListByPage(page);
    } catch (Exception e1) {
        log.info("Need to enter identifying code, {}", page.url());
        processBlocked(page, next);
        return;
    }
    // Step 3: parse the article details and add them as crawler seeds
    ResultBase<List<ArticleTransferVO>> articleTransferResult = wxCrawlService.parseArticleList(accountId, accountName, articles);
    if (articleTransferResult.isSuccess() && CollectionUtils.isNotEmpty(articleTransferResult.getValue())) {
        articleTransferResult.getValue().forEach(article -> {
            CrawlDatum crawlDatum = parseArticleSummary(article);
            if (crawlDatum != null) {
                next.add(crawlDatum);
            }
        });
    }
}
Example 15

@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.getUrl();
    if (page.matchUrl("http://news.cqu.edu.cn/news/article/*.*html")) {
        String title = page.select("div[class=title]>h1").first().text();
        String content = page.select("div#zoom", 0).text();
        String time = page.select("span[class=datetime]").text();
        String desc = page.select("div[class=description]>p").text();
        String tag = page.select("div#location a:nth-child(2)").text();
        NewsDetailModel bean = new NewsDetailModel();
        bean.setUrl(url);
        bean.setId(ParseMD5.parseStrToMD5(bean.getUrl()));
        bean.setTitle(title);
        bean.setContent(content);
        bean.setTime(time);
        NewsDao db = new NewsDao();
        db.saveNewsInfos(bean);
        System.out.println("url:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("time:\n" + time);
        System.out.println("desc:\n" + desc);
        System.out.println("content:\n" + content);
        System.out.println("tag:\n" + tag);
        System.out.println("----------------------------------------------------------");
    }
}
Example 16

@AfterParse
public void afterParse(Page page, CrawlDatums next) {
    // If the current page has depth x, tasks parsed from it get depth x+1
    int depth = 1;
    // If the depth metadata was omitted when the seed was added,
    // this fallback keeps the crawl from failing
    try {
        depth = page.metaAsInt("depth");
    } catch (Exception ex) {
        // no depth metadata; keep the default of 1
    }
    depth++;
    next.meta("depth", depth);
}
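A matching seed setup (assumed, not part of the source) would initialize the counter explicitly, so the fallback above never has to guess:

crawler.addSeed(new CrawlDatum("https://example.com/").meta("depth", 1)); // placeholder URL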
Example 17

@MatchType(types = "content")
public void visitContent(Page page, CrawlDatums next) {
    // For a content page, extract the book title and Douban rating
    String title = page.select("h1>span").first().text();
    String score = page.select("strong.ll.rating_num").first().text();
    System.out.println("title:" + title + "\tscore:" + score);
}
Example 18

@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchUrl("https://blog.csdn.net/.*/article/details/.*")) {
        String title = page.select("h1.title-article").first().text();
        String author = page.select("a#uid").first().text();
        System.out.println("title:" + title + "\tauthor:" + author);
    }
}
Example 19

@MatchType(types = "content")
public void visitContent(Page page, CrawlDatums next) {
    /* if type is "content", extract title and content of the news by css selector */
    String title = page.select("h1[class=lh-condensed]").first().text();
    String content = page.selectText("div.content.markdown-body");
    // read title_prefix and content_length_limit from the configuration
    title = getConf().getString("title_prefix") + title;
    // guard against pages shorter than the configured limit
    content = content.substring(0, Math.min(content.length(), getConf().getInteger("content_length_limit")));
    System.out.println("URL:\n" + page.url());
    System.out.println("title:\n" + title);
    System.out.println("content:\n" + content);
}
Example 20

@MatchCode(codes = {301, 302})
public void visitRedirect(Page page, CrawlDatums next) {
    try {
        // page.location() may be a relative url path,
        // so we have to construct an absolute url
        String redirectUrl = new URL(new URL(page.url()), page.location()).toExternalForm();
        next.addAndReturn(redirectUrl).meta(page.copyMeta());
    } catch (MalformedURLException e) {
        // the way to handle exceptions in WebCollector
        ExceptionUtils.fail(e);
    }
}
Example 21

@MatchType(types = "outlink")
public void visitOutlink(Page page, CrawlDatums next) {
    int depth = page.metaAsInt("depth");
    int pageIndex = page.metaAsInt("pageIndex");
    int rank = page.metaAsInt("rank");
    String referer = page.meta("referer");
    String line = String.format("page %s, result %s: %s (%s bytes)\tdepth=%s\treferer=%s",
            pageIndex, rank + 1, page.doc().title(), page.content().length, depth, referer);
    System.out.println(line);
}
Example 22

@Override
public void visit(Page page, CrawlDatums next) {
    try {
        this.myMethod();
    } catch (Exception e) {
        // When an exception is caught and the page should be re-crawled,
        // use ExceptionUtils.fail(e).
        // Re-throwing the checked exception would not compile, because visit()
        // declares no exceptions, and silently swallowing it would lose the failure.
        // fail() throws a RuntimeException, so visit() needs no throws clause.
        ExceptionUtils.fail(e);
    }
}
Example 23

@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchType("content")) {
        String title = page.select("h1.title-article").first().text();
        String author = page.select("p.name>a.text-truncate").first().text();
        System.out.println("title:" + title + "\tauthor:" + author);
    }
}
Example 24

public void checkMethod(Method method) throws Exception {
    Class<?>[] paramTypes = method.getParameterTypes();
    if (paramTypes.length != 2) {
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }
    if (!paramTypes[0].equals(Page.class)) {
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }
    if (!paramTypes[1].equals(CrawlDatums.class)) {
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }
}
Example 25

public Method getMethodByUrlRegex(Page page) {
    for (Map.Entry<String, Method> entry : urlRegexMethodMap.entrySet()) {
        String urlRegex = entry.getKey();
        if (page.matchUrl(urlRegex)) {
            return entry.getValue();
        }
    }
    return null;
}
Example 26

public Method getMethodByUrlRegexRule(Page page) {
    for (Map.Entry<RegexRule, Method> entry : urlRegexRuleMethodMap.entrySet()) {
        RegexRule regexRule = entry.getKey();
        if (page.matchUrlRegexRule(regexRule)) {
            return entry.getValue();
        }
    }
    return null;
}
Example 27

public void dispatch(Page page, CrawlDatums next) throws InvocationTargetException, IllegalAccessException {
    if (beforeVisitMethod != null) {
        beforeVisitMethod.invoke(visitor, page, next);
    }
    // resolve the handler by precedence: code, type, url regex, regex rule, default visit()
    Method method = getMethodByCode(page);
    if (method == null) {
        method = getMethodByType(page);
    }
    if (method == null) {
        method = getMethodByUrlRegex(page);
    }
    if (method == null) {
        method = getMethodByUrlRegexRule(page);
    }
    if (method == null) {
        method = visitMethod;
    }
    method.invoke(visitor, page, next);
    if (autoParse && !regexRule.isEmpty()) {
        parseLink(page, next);
    }
    if (afterParseMethod != null) {
        afterParseMethod.invoke(visitor, page, next);
    }
}
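Note the precedence encoded in the chain above: an @MatchCode handler wins over @MatchType, which wins over @MatchUrl and RegexRule matching, and the plain visit() method is only the final fallback; automatic link extraction and the @AfterParse hook then run regardless of which handler was chosen.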
Example 28

protected void parseLink(Page page, CrawlDatums next) {
    String contentType = page.contentType();
    if (contentType != null && contentType.contains("text/html")) {
        Document doc = page.doc();
        if (doc != null) {
            Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
            next.add(links);
        }
    }
}
Example 29

@Override
public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
    Page page = requester.getResponse(datum);
    // visitor.visit(page, next);
    visitorMethodDispatcher.dispatch(page, next);
}