Java source code examples: cn.edu.hfut.dmic.webcollector.model.Page

Example 1
@Override
public void visit(Page page, CrawlDatums next) {
    try {
        Thread.sleep(sleepTime != null ? sleepTime.longValue() : 5000L);
    } catch (InterruptedException e) {
        // restore the interrupt flag rather than swallowing it
        Thread.currentThread().interrupt();
        log.info("Sleep interrupted", e);
    }
    log.info("Visit {}", page.url());
    if (page.matchType(WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)) {
        parseSogouSearchResult(page, next);
    } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)) {
        parseWxArticleList(page, next);
    } else if (page.matchType(WxCrawlerConstant.CrawlDatumType.ARTICLE_DETAIL)) {
        parseWxArticleDetail(page, next);
    }
}
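
A minimal sketch of how the typed seeds consumed by this dispatch might be created; the query URL, the crawler variable, and the initial meta values are assumptions based on this example:

CrawlDatum seed = new CrawlDatum("https://weixin.sogou.com/weixin?type=1&query=someAccount",
        WxCrawlerConstant.CrawlDatumType.ACCOUNT_SEARCH)
        .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, "someAccount")
        .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0);
// hypothetical crawler instance; only seeds typed ACCOUNT_SEARCH reach parseSogouSearchResult
crawler.addSeed(seed);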
 
Example 2
@Override
public Page getResponse(CrawlDatum crawlDatum) throws Exception {
    Page page = super.getResponse(crawlDatum);
    Response response = page.obj();
    if (response != null) {
        if (currentProxy != null && Proxy.Type.DIRECT.equals(currentProxy.type())) {
            log.info("##### NOW it is DIRECT proxy #####");
            log.info("##### response: {}", response);
        }
        if (response.isRedirect()) {
            log.info("Resource is redirect, code: {}, location: {}", response.code(), response.header("location"));
            removeBadProxy(currentProxy);
        } else {
            List<Cookie> cookies = Cookie.parseAll(response.request().url(), response.headers());
            if (CollectionUtils.isNotEmpty(cookies)) {
                client.cookieJar().saveFromResponse(response.request().url(), cookies);
                addGoodProxy(currentProxy);
            }
        }
    }
    return page;
}
 
Example 3
@Override
public void visit(Page page) {
    if (!Pattern.matches(newsUrlRegex, page.getUrl())) {
        return;
    }
    Elements contentDivs = page.getDoc().select(contentCSSSelector);
    if (contentDivs.isEmpty()) {
        return;
    }
    String content = contentDivs.first().text().trim();
    if (content.isEmpty()) {
        return;
    }

    try {
        File file = new File(dir, id.getAndIncrement() + "");
        // try-with-resources closes the stream even if write fails
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(content.getBytes("utf-8"));
        }
    } catch (Exception ex) {
        LogUtils.getLogger().info("Exception", ex);
    }
}
 
Example 4
@Override
public void visit(Page page, CrawlDatums next) {
    // If the http status code is 301 or 302,
    // you have to obtain the redirected url from the "Location" header of the response
    // and add it to the subsequent tasks via "next.add(redirectedUrl)".
    // Since the page may contain metadata, copy it to the added task
    // with "xxxx.meta(page.copyMeta())".
    if(page.code() == 301 || page.code() == 302){
        try {
            // page.location() may be relative url path
            // we have to construct an absolute url path
            String redirectUrl = new URL(new URL(page.url()), page.location()).toExternalForm();
            next.addAndReturn(redirectUrl).meta(page.copyMeta());
        } catch (MalformedURLException e) {
            //the way to handle exceptions in WebCollector
            ExceptionUtils.fail(e);
        }
        return;
    }
    System.out.println("this page is not redirected: " + page.url());
}
 
Example 5
@Override
public void visit(Page page, CrawlDatums next) {

    String type = page.meta("type");
    // For a tag-list page, extract links to follow-up pages and queue them
    if (type.equals("taglist")) {
        // All links extracted here point to book-list pages,
        // so attach meta "type=booklist" to them
        next.addAndReturn(page.links("table.tagCol td>a")).meta("type", "booklist");
    } else if (type.equals("booklist")) {
        // Links on a book-list page point to content pages
        next.addAndReturn(page.links("div.info>h2>a")).meta("type", "content");
    } else if (type.equals("content")) {
        // Content page: extract the book title and its Douban rating
        String title = page.select("h1>span").first().text();
        String score = page.select("strong.ll.rating_num").first().text();
        System.out.println("title:" + title + "\tscore:" + score);
    }

}
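
A minimal sketch of how the first seed of this taglist -> booklist -> content chain might be added; the Douban tag URL and the call site are assumptions:

// hypothetical seeding, e.g. in the crawler's constructor
crawler.addSeedAndReturn("https://book.douban.com/tag/").meta("type", "taglist");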
 
Example 6
@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.url();

    if (page.matchType("list")) {
        /*if type is "list"*/
        /*extract content-page links by css selector and mark their type as "content"*/
        next.add(page.links("h1.lh-condensed>a")).type("content");
    }else if(page.matchType("content")) {
        /*if type is "content"*/
        /*extract title and content of news by css selector*/
        String title = page.select("h1[class=lh-condensed]").first().text();
        String content = page.selectText("div.content.markdown-body");

        //read title_prefix and content_length_limit from configuration
        title = getConf().getString("title_prefix") + title;
        content = content.substring(0, getConf().getInteger("content_length_limit"));

        System.out.println("URL:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("content:\n" + content);
    }

}
 
Example 7
@MatchType(types = "searchEngine")
public void visitSearchEngine(Page page, CrawlDatums next) {
    String keyword = page.meta("keyword");
    int pageIndex = page.metaAsInt("pageIndex");
    System.out.println("成功抓取关键词" + keyword + "的第" + pageIndex + "页搜索结果");
    Elements results = page.select("li.b_algo>h2>a");

    for (int rank = 0; rank < results.size(); rank++) {
        Element result = results.get(rank);
        /*
        We also want to crawl the page behind each search result, referred to
        here as an outlink. When visiting an outlink we still want to know which
        result page and which position it came from, so the page index and rank
        are stored in the follow-up CrawlDatum. To distinguish outlinks from
        search-engine result pages, type is set to "outlink"; this value is
        entirely user-defined and can be anything.
        */
        String href = result.attr("abs:href");
        next.addAndReturn(href)
                .type("outlink")
                .meta("keyword", keyword)
                .meta("pageIndex", pageIndex)
                .meta("rank", rank);
    }
}
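
A sketch of how the "searchEngine" seed carrying these meta fields might be created; the Bing URL and keyword are illustrative assumptions:

// hypothetical seeding for the visitSearchEngine handler above
crawler.addSeedAndReturn("https://www.bing.com/search?q=web+crawler")
        .type("searchEngine")
        .meta("keyword", "web crawler")
        .meta("pageIndex", 1);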
 
Example 8
@Override
public void visit(Page page, CrawlDatums next) {

    if (page.matchType("taglist")) {
        // For a tag-list page, extract the book-list links,
        // set their type to "booklist" and add them as follow-up tasks
        next.add(page.links("table.tagCol td>a"), "booklist");
    } else if (page.matchType("booklist")) {
        next.add(page.links("div.info>h2>a"), "content");
    } else if (page.matchType("content")) {
        // Content page: extract the book title and its Douban rating
        String title = page.select("h1>span").first().text();
        String score = page.select("strong.ll.rating_num").first().text();
        System.out.println("title:" + title + "\tscore:" + score);
    }
 
}
 
Example 9
@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.url();
    /*if page is news page*/
    if (page.matchUrl("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")) {

        /*extract title and content of news by css selector*/
        String title = page.select("h1[class=lh-condensed]").first().text();
        String content = page.selectText("div.content.markdown-body");

        System.out.println("URL:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("content:\n" + content);

        /*If you want to add urls to crawl, add them to next*/
        /*WebCollector automatically filters links that have been fetched before*/
        /*If autoParse is true and a link added to next does not match the
          regex rules, that link will also be filtered.*/
        //next.add("http://xxxxxx.com");
    }
}
 
Example 10
@MatchUrl(urlRegex = "https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")
public void visitNews(Page page, CrawlDatums next) {
    /*extract title and content of news by css selector*/
    String title = page.select("h1[class=lh-condensed]").first().text();
    String content = page.selectText("div.content.markdown-body");

    System.out.println("URL:\n" + page.url());
    System.out.println("title:\n" + title);
    System.out.println("content:\n" + content);

    /*If you want to add urls to crawl, add them to next*/
    /*WebCollector automatically filters links that have been fetched before*/
    /*If autoParse is true and a link added to next does not match the
      regex rules, that link will also be filtered.*/
    //next.add("http://xxxxxx.com");
}
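
For context, a minimal sketch of the crawler class an annotated handler like visitNews would live in, assuming WebCollector 2.x's BreadthCrawler API; the class name and crawl path are illustrative:

public class NewsCrawler extends BreadthCrawler {

    public NewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.addSeed("https://blog.github.com/");
        // restrict auto-parsed links to news detail pages
        this.addRegex("https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/");
    }

    @MatchUrl(urlRegex = "https://blog.github.com/[0-9]{4}-[0-9]{2}-[0-9]{2}[^/]+/")
    public void visitNews(Page page, CrawlDatums next) {
        System.out.println("title:\n" + page.select("h1[class=lh-condensed]").first().text());
    }

    public static void main(String[] args) throws Exception {
        NewsCrawler crawler = new NewsCrawler("crawl", true);
        crawler.start(2);  // crawl two levels deep
    }
}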
 
Example 11
static String buildMessage(Visitor visitor, Method method) {

    String fullMethodName = ReflectionUtils.getFullMethodName(method);

    String validMethodFormat = String.format("public void %s(%s param0, %s param1){...}",
            method.getName(),
            Page.class.getName(),
            CrawlDatums.class.getName()
    );
    StringBuilder sb = new StringBuilder("\n\tThe definition of ")
            .append(fullMethodName)
            .append(" is invalid,\n")
            .append("\texpect    \"").append(validMethodFormat).append("\",")
            .append("\n\tbut found \"")
            .append(ReflectionUtils.getMethodDeclaration(method)).append("{...}")
            .append("\"");
    return sb.toString();
}
 
Example 12
/**
 * Parse a Sogou search result page for WeChat official accounts
 * @param page
 * @param next
 */
protected void parseSogouSearchResult(Page page, CrawlDatums next){
    String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
    int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);

    // Check how many retries with different proxies have been attempted
    if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
        log.info("Tried many times with different proxies but all failed" +
                ", skip, accountName: {}", accountName);
        return;
    }
    log.info("Parsing sogou search result page, accountName: {}", accountName);
    Element accountLinkEle = page.select("p.tit>a").first();
    if (accountLinkEle == null) {
        processBlocked(page, next);
        return;
    }
    // Guard against a mismatched official-account name
    String detectedAccount = accountLinkEle.text().trim();
    if (!accountName.equals(detectedAccount)) {
        log.info("accountName \"{}\" does not match \"{}\"", accountName, detectedAccount);
        return;
    }
    // Extract the account URL from the search result page
    String accountUrl = accountLinkEle.attr("abs:href");
    Element wxAccountEl = page.select("p.info>label[name='em_weixinhao']").first();
    if (wxAccountEl == null || StringUtils.isEmpty(wxAccountEl.text())) {
        log.info("accountId \"{}\" not exist", accountName);
        return;
    }
    if(accountUrl.startsWith(WxCrawlerConstant.HTTP_PROTOCOL)) {
        accountUrl = accountUrl.replaceFirst(WxCrawlerConstant.HTTP_PROTOCOL, WxCrawlerConstant.HTTPS_PROTOCOL);
    }
    next.add(new CrawlDatum(accountUrl, WxCrawlerConstant.CrawlDatumType.ARTICLE_LIST)
            .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME, accountName)
            .meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID, wxAccountEl.text())
            .meta(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT, 0));
}
 
Example 13
private List<ArticleSummaryObj> parseArticleListByPage(Page page) throws Exception {
    int startIndex = page.html().indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY) +
            WxCrawlerConstant.ArticleList.ARTICLE_LIST_KEY.length();
    int endIndex = page.html().indexOf(WxCrawlerConstant.ArticleList.ARTICLE_LIST_SUFFIX);
    String jsonStr = page.html().substring(startIndex, endIndex).trim();
    // drop the trailing character left after the JSON object (presumably a ';')
    jsonStr = jsonStr.substring(0, jsonStr.length() - 1);
    JSONObject json = JSONObject.parseObject(jsonStr);
    return JSONArray.parseArray(json.getString("list"), ArticleSummaryObj.class);
}
 
Example 14
/**
 * Parse the article list on a WeChat official account's home page
 * @param page
 * @param next
 */
protected void parseWxArticleList(Page page, CrawlDatums next){
    String accountName = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_NAME);
    log.info("Parsing weixin article list page,accountName:{}", accountName);
    String accountId = page.meta(WxCrawlerConstant.CrawlMetaKey.ACCOUNT_ID);
    int triedCount = page.metaAsInt(WxCrawlerConstant.CrawlMetaKey.TRIED_COUNT);

    // Step 1: check how many retries with different proxies have been attempted
    if (triedCount > WxCrawlerConstant.MAX_TRY_COUNT) {
        log.info("Tried many times with different proxies but all failed" +
                ", skip, accountName: {}", accountName);
        return;
    }

    // Step 2: extract the article list
    List<ArticleSummaryObj> articles = null;
    try {
        articles = parseArticleListByPage(page);
    } catch (Exception e1) {
        log.info("Need to enter identifying code, {}", page.url());
        processBlocked(page, next);
        return;
    }

    // Step 3: parse article details and add them as crawler seeds
    ResultBase<List<ArticleTransferVO>> articleTransferResult = wxCrawlService.parseArticleList(accountId, accountName, articles);
    if(articleTransferResult.isSuccess() && CollectionUtils.isNotEmpty(articleTransferResult.getValue())) {
        articleTransferResult.getValue().forEach(article -> {
            CrawlDatum crawlDatum = parseArticleSummary(article);
            if (crawlDatum != null) {
                next.add(crawlDatum);
            }
        });
    }
}
 
Example 15
@Override
public void visit(Page page, CrawlDatums next) {
    String url = page.getUrl();
    
    if (page.matchUrl("http://news.cqu.edu.cn/news/article/*.*html")) {
       
        String title = page.select("div[class=title]>h1").first().text();
        String content = page.select("div#zoom", 0).text();
        String time = page.select("span[class=datetime]").text();
        String desc = page.select("div[class=description]>p").text();
        String tag = page.select("div#location a:nth-child(2)").text();  
       
        NewsDetailModel bean = new NewsDetailModel();
        bean.setUrl(url);
        bean.setId(ParseMD5.parseStrToMD5(bean.getUrl()));
        bean.setTitle(title);
        bean.setContent(content);
        bean.setTime(time);

        NewsDao db = new NewsDao();
        db.saveNewsInfos(bean);
                   
        System.out.println("url:\n" + url);
        System.out.println("title:\n" + title);
        System.out.println("time:\n" + time);
        System.out.println("desc:\n" + desc);
        System.out.println("content:\n" + content);
        System.out.println("tag:\n" + tag);
        
        System.out.println("----------------------------------------------------------");
    }
}
 
Example 16
@AfterParse
public void afterParse(Page page, CrawlDatums next) {
    // If the current page has depth x, follow-up tasks parsed from it get depth x+1
    int depth = 1;
    // If the depth meta was not set when the seed was added,
    // this fallback keeps the program from failing
    try {
        depth = page.metaAsInt("depth");
    } catch (Exception ex) {
        // no depth meta on this page; keep the default of 1
    }
    depth++;
    next.meta("depth", depth);
}
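
A sketch of how the "depth" meta maintained above might be consumed; the depth cap and the visit body are assumptions:

@Override
public void visit(Page page, CrawlDatums next) {
    // hypothetical depth cap; seeds are assumed to carry meta "depth"
    int depth = page.metaAsInt("depth");
    if (depth >= 5) {
        return;  // stop expanding beyond the chosen limit
    }
    System.out.println("depth=" + depth + "\turl=" + page.url());
}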
 
Example 17
@MatchType(types = "content")
public void visitContent(Page page, CrawlDatums next) {
    // Content page: extract the book title and its Douban rating
    String title = page.select("h1>span").first().text();
    String score = page.select("strong.ll.rating_num").first().text();
    System.out.println("title:" + title + "\tscore:" + score);
}
 
Example 18
@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchUrl("https://blog.csdn.net/.*/article/details/.*")) {
        String title = page.select("h1.title-article").first().text();
        String author = page.select("a#uid").first().text();
        System.out.println("title:" + title + "\tauthor:" + author);
    }
}
 
Example 19
@MatchType(types = "content")
public void visitContent(Page page, CrawlDatums next){
    /*if type is "content"*/
    /*extract title and content of news by css selector*/
    String title = page.select("h1[class=lh-condensed]").first().text();
    String content = page.selectText("div.content.markdown-body");

    //read title_prefix and content_length_limit from configuration
    title = getConf().getString("title_prefix") + title;
    content = content.substring(0, getConf().getInteger("content_length_limit"));

    System.out.println("URL:\n" + page.url());
    System.out.println("title:\n" + title);
    System.out.println("content:\n" + content);
}
 
Example 20
@MatchCode(codes = {301, 302})
public void visitRedirect(Page page, CrawlDatums next){
    try {
        // page.location() may be relative url path
        // we have to construct an absolute url path
        String redirectUrl = new URL(new URL(page.url()), page.location()).toExternalForm();
        next.addAndReturn(redirectUrl).meta(page.copyMeta());
    } catch (MalformedURLException e) {
        //the way to handle exceptions in WebCollector
        ExceptionUtils.fail(e);
    }
}
 
Example 21
@MatchType(types = "outlink")
public void visitOutlink(Page page, CrawlDatums next) {
    int depth = page.metaAsInt("depth");
    int pageIndex = page.metaAsInt("pageIndex");
    int rank = page.metaAsInt("rank");
    String referer = page.meta("referer");

    String line = String.format("page %s, result %s: %s (%s bytes)\tdepth=%s\treferer=%s",
            pageIndex, rank + 1, page.doc().title(), page.content().length, depth, referer);
    System.out.println(line);
}
 
Example 22
@Override
public void visit(Page page, CrawlDatums next) {
    try {
        this.myMethod();
    } catch (Exception e) {
        // When an exception is caught and the page should be re-crawled,
        // use ExceptionUtils.fail(e).
        // Re-throwing the checked exception would not compile, since visit
        // declares no throws clause; fail(e) wraps it in a RuntimeException,
        // so no throws clause is needed.
        ExceptionUtils.fail(e);
    }
}
 
Example 23
@Override
public void visit(Page page, CrawlDatums next) {
    if (page.matchType("content")) {
        String title = page.select("h1.title-article").first().text();
        String author = page.select("p.name>a.text-truncate").first().text();
        System.out.println("title:" + title + "\tauthor:" + author);
    }
}
 
Example 24
public void checkMethod(Method method) throws Exception {
    Class<?>[] paramTypes = method.getParameterTypes();
    if (paramTypes.length != 2) {
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }

    if(!paramTypes[0].equals(Page.class)){
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }

    if(!paramTypes[1].equals(CrawlDatums.class)){
        throw new Visitor.InvalidAnnotatedVisitorMethodException(visitor, method);
    }
}
 
Example 25
public Method getMethodByUrlRegex(Page page){
    for(Map.Entry<String, Method> entry: urlRegexMethodMap.entrySet()){
        String urlRegex = entry.getKey();
        if(page.matchUrl(urlRegex)){
            return entry.getValue();
        }
    }
    return null;
}
 
Example 26
public Method getMethodByUrlRegexRule(Page page){
    for(Map.Entry<RegexRule, Method> entry: urlRegexRuleMethodMap.entrySet()){
        RegexRule regexRule = entry.getKey();
        if(page.matchUrlRegexRule(regexRule)){
            return entry.getValue();
        }
    }
    return null;
}
 
Example 27
public void dispatch(Page page, CrawlDatums next) throws InvocationTargetException, IllegalAccessException {
    HashSet<Method> invokedMethods = new HashSet<Method>();

    if (beforeVisitMethod != null) {
        beforeVisitMethod.invoke(visitor, page, next);
    }

    // lookup order: status code, then type, then url regex, then RegexRule,
    // then fall back to the default visit method
    Method method;
    method = getMethodByCode(page);
    if (method == null) {
        method = getMethodByType(page);
    }
    if (method == null) {
        method = getMethodByUrlRegex(page);
    }
    if (method == null) {
        method = getMethodByUrlRegexRule(page);
    }
    if (method == null) {
        method = visitMethod;
    }
    method.invoke(visitor, page, next);

    if (autoParse && !regexRule.isEmpty()) {
        parseLink(page, next);
    }

    if (afterParseMethod != null) {
        afterParseMethod.invoke(visitor, page, next);
    }
}
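
Given that lookup order (status code, then type, then url regex, then RegexRule, then the default visit), a hedged sketch of the effect; the class and method names are illustrative:

public class DemoVisitor extends BreadthCrawler {

    public DemoVisitor(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    @MatchCode(codes = {302})
    public void visitRedirect(Page page, CrawlDatums next) {
        // a 302 page typed "content" lands here: code matching is tried first
    }

    @MatchType(types = "content")
    public void visitContent(Page page, CrawlDatums next) {
        // only pages typed "content" that are not 302s reach this method
    }
}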
 
Example 28
protected void parseLink(Page page, CrawlDatums next) {
    String contentType = page.contentType();
    if (contentType != null && contentType.contains("text/html")) {
        Document doc = page.doc();
        if (doc != null) {
            Links links = new Links().addByRegex(doc, regexRule, getConf().getAutoDetectImg());
            next.add(links);
        }
    }

}
 
Example 29
@Override
public void execute(CrawlDatum datum, CrawlDatums next) throws Exception {
    Page page = requester.getResponse(datum);
    // the dispatcher replaces the old direct visitor.visit(page, next) call
    visitorMethodDispatcher.dispatch(page, next);
}