Java源码示例:crawlercommons.sitemaps.SiteMapIndex
示例1
/**
 * Fetches the sitemap at {@code siteMapUrl} with an HTTP GET and returns the URLs it lists.
 * If the parsed document reports type {@code SitemapType.INDEX}, every sitemap referenced by
 * the index is fetched and parsed recursively, and the resulting URLs are appended in
 * iteration order into a single list.
 *
 * @param siteMapUrl    location of the sitemap (or sitemap index) to fetch
 * @param siteMapParser parser applied to the response body
 * @return the URLs of the sitemap, or the combined URLs of all sitemaps reached through an index
 * @throws SiteMapParseException wrapping any {@link IOException}, {@code UnknownFormatException}
 *                               or {@link URISyntaxException} raised while fetching or parsing
 */
private Collection<SiteMapURL> parse(URI siteMapUrl, crawlercommons.sitemaps.SiteMapParser siteMapParser)
        throws SiteMapParseException
{
    try
    {
        HttpClientContext httpContext = new HttpClientContext();
        HttpResponse httpResponse = httpClient.doHttpGet(siteMapUrl, httpContext);
        // The base URI is resolved from the request context and stripped of user info
        // before being handed to the parser.
        URI sanitizedUri = UriUtils.removeUserInfo(getBaseUri(httpContext, siteMapUrl));
        AbstractSiteMap parsed = siteMapParser.parseSiteMap(httpResponse.getResponseBody(),
                sanitizedUri.toURL());

        // Plain sitemap: its URL collection is the result.
        if (parsed.getType() != SitemapType.INDEX)
        {
            return ((SiteMap) parsed).getSiteMapUrls();
        }

        // Sitemap index: descend into each referenced sitemap and gather its URLs.
        List<SiteMapURL> collected = new LinkedList<>();
        for (AbstractSiteMap nested : ((SiteMapIndex) parsed).getSitemaps())
        {
            Collection<SiteMapURL> nestedUrls = parse(nested.getUrl().toURI(), siteMapParser);
            collected.addAll(nestedUrls);
        }
        return collected;
    }
    catch (IOException | UnknownFormatException | URISyntaxException e)
    {
        throw new SiteMapParseException(e.getMessage(), e);
    }
}
示例2
/**
 * Parses the fetched content as a sitemap and converts every listed URL into an
 * {@code ExtractedUrl}.
 *
 * @param fetchedUrl fetch result supplying the content to parse and the URL it belongs to
 * @return a {@code ParserResult} whose first argument is {@code null} and whose second is the
 *         array of extracted URLs when the content is a {@code SiteMap}; {@code null} when the
 *         parser yields a {@code SiteMapIndex} or any other type (the case is only logged)
 * @throws Exception if constructing the URL or parsing the sitemap fails
 */
@Override
public ParserResult parse(FetchResultUrl fetchedUrl) throws Exception {
    if (LOGGER.isTraceEnabled()) {
        LOGGER.trace("Parsing sitemap '{}'", fetchedUrl.getFetchedUrl());
    }

    AbstractSiteMap parsed = _siteMapParser.parseSiteMap(fetchedUrl.getContent(),
            new URL(fetchedUrl.getUrl()));

    // Anything that is not a plain sitemap is reported and yields no result.
    if (!(parsed instanceof SiteMap)) {
        if (parsed instanceof SiteMapIndex) {
            // Log this - so we can deal with this in the future
            LOGGER.info("Unexpected SiteMapIndex encountered while parsing sitemap url: "
                    + fetchedUrl.getFetchedUrl());
        } else {
            LOGGER.warn("Unknown type for AbstractSiteMap encountered: "
                    + parsed.getClass().getName());
        }
        return null;
    }

    // Plain sitemap: map each entry to its external-form URL string.
    Collection<SiteMapURL> entries = ((SiteMap) parsed).getSiteMapUrls();
    ArrayList<ExtractedUrl> outlinks = new ArrayList<ExtractedUrl>();
    for (SiteMapURL entry : entries) {
        String externalForm = entry.getUrl().toExternalForm();
        outlinks.add(new ExtractedUrl(externalForm));
    }
    ExtractedUrl[] outlinkArray = outlinks.toArray(new ExtractedUrl[outlinks.size()]);
    return new ParserResult(null, outlinkArray);
}
示例3
XMLIndexHandler(URL url, LinkedList<String> elementStack, boolean strict) {
super(elementStack, strict);
sitemap = new SiteMapIndex(url);
sitemap.setType(SitemapType.INDEX);
}