Java源码示例:org.archive.io.ArchiveReader

示例1
/**
 * Builds an ARC reader for the given file, positioned at {@code offset}.
 *
 * A file that passes {@code testCompressedARCFile} is opened with a
 * {@code CompressedARCReader}. Any other file must pass the
 * extension-and-magic check before an {@code UncompressedARCReader} is
 * created for it.
 *
 * @param arcFile the file to open
 * @param skipSuffixTest forwarded unchanged to {@code testCompressedARCFile}
 * @param offset forwarded unchanged to the reader constructor
 * @return a reader over {@code arcFile}
 * @throws IOException if the file is not compressed and fails the
 *         extension-and-magic check, or if a called helper throws it
 */
protected ArchiveReader getArchiveReader(final File arcFile,
        final boolean skipSuffixTest, final long offset)
        throws IOException {
    // Compressed files skip the extension/magic validation entirely.
    if (testCompressedARCFile(arcFile, skipSuffixTest)) {
        return (ARCReader) ARCReaderFactory.factory.
            new CompressedARCReader(arcFile, offset);
    }

    final boolean readableArc = FileUtils.isReadableWithExtensionAndMagic(
        arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!readableArc) {
        final String path = arcFile.getAbsolutePath();
        throw new IOException(path + " is not an Internet Archive ARC file.");
    }

    return (ARCReader) ARCReaderFactory.factory.
        new UncompressedARCReader(arcFile, offset);
}
 
示例2
/**
 * Reads the first record found at {@code offset} in {@code testfile} through
 * the stream-based {@code ARCReaderFactory.get} overload and checks its URL
 * and read position.
 *
 * @param testfile the archive file to read
 * @param offset the position to seek to before handing the stream to the reader
 * @param uri the URL the record's header is expected to report
 * @throws Exception if the file cannot be opened, sought or read
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    RandomAccessFile raf = new RandomAccessFile(testfile, "r");
    try {
        raf.seek(offset);
        // The stream shares raf's file descriptor, so it starts reading at the sought offset.
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        // Original author's note: the File-plus-offset overload,
        // ARCReaderFactory.get(testfile, offset), is known to work.
        ArchiveRecord record = reader.get();

        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);

        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    } finally {
        // Release the descriptor even when a read or an assertion above fails.
        raf.close();
    }
}
 
示例3
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * Every record of the archive file named by {@code getName()} under
 * {@code baseDir} is visited. WARC records are indexed only when their type
 * is {@code WARCConstants.RESPONSE} and their mimetype is not
 * {@code text/dns}; all non-WARC records are passed to {@code indexARCRecord}.
 * As a side effect the {@code compressed} field is set from the reader.
 *
 * @param baseDir the base directory of the arcs
 * @return the map filled in by the record-indexing helpers
 * @throws IOException thrown if there is an error
 * @throws ParseException declared for the indexing helpers
 *         (NOTE(review): confirm which helper actually throws it)
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
	
	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();
		
		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();
			
			if(rec instanceof WARCRecord) {
				// A record without a type header cannot be a response; skip it instead of throwing an NPE.
				Object typeValue = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
				if(typeValue != null && typeValue.toString().equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					// Constant-first comparison: a response with no mimetype is still indexed.
					if(!"text/dns".equals(mime)) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	} finally {
		// Close the reader even when a record fails to index.
		reader.close();
	}
	
	return results;
}
 
示例4
/**
 * Fetches one WARC file from the CommonCrawl S3 bucket and prints the header,
 * URL and the first 500 characters of its leading records.
 *
 * @param args unused
 * @throws IOException if the archive stream cannot be read
 * @throws S3ServiceException if the object cannot be fetched from S3
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);
	
	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
	
	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as all the record's stated length
			byte[] rawData = new byte[r.available()];
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the record ends.
			int filled = 0;
			while (filled < rawData.length) {
				int count = r.read(rawData, filled, rawData.length - filled);
				if (count < 0) {
					break;
				}
				filled += count;
			}
			// Note: potential optimization would be to have a large buffer only allocated once
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Only the bytes actually read are decoded.
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Post-increment compare: the loop stops after the sixth record.
			if (i++ > 4) {
				break;
			}
		}
	} finally {
		// Release the reader whether or not iteration completed.
		ar.close();
	}
}
 
示例5
/**
 * Reads a local compressed WARC file and prints the header, URL and the
 * first 500 characters of its leading records.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
public static void main(String[] args) throws IOException {
	// Set up a local compressed WARC file for reading 
	String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	FileInputStream is = new FileInputStream(fn);
	try {
		// The file name identifies the ArchiveReader and indicates if it should be decompressed
		ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
		try {
			// Once we have an ArchiveReader, we can work through each of the records it contains
			int i = 0;
			for(ArchiveRecord r : ar) {
				// The header file contains information such as the type of record, size, creation time, and URL
				System.out.println(r.getHeader());
				System.out.println(r.getHeader().getUrl());
				System.out.println();
				
				// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
				// Create a byte array that is as long as the record's stated length
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				
				// Why don't we convert it to a string and print the start of it? Let's hope it's text!
				String content = new String(rawData);
				System.out.println(content.substring(0, Math.min(500, content.length())));
				System.out.println((content.length() > 500 ? "..." : ""));
				
				// Pretty printing to make the output more readable 
				System.out.println("=-=-=-=-=-=-=-=-=");
				// Post-increment compare: the loop stops after the sixth record.
				if (i++ > 4) {
					break;
				}
			}
		} finally {
			ar.close();
		}
	} finally {
		// Also covers the case where the factory call itself fails.
		is.close();
	}
}
 
示例6
/**
 * Emits one (lower-cased tag name, {@code outVal}) pair for every match of
 * {@code HTML_TAG_PATTERN} in the body of each HTTP response record whose
 * header text contains {@code Content-Type: text/html}.
 *
 * Any exception raised while handling a record is logged and counted under
 * {@code MAPPERCOUNTER.EXCEPTIONS}; processing continues with the next record.
 *
 * @param key not read by this method
 * @param value the archive whose records are scanned
 * @param context receives the output pairs and the counters
 * @throws IOException declared by the overridden method
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	// Compile the regular expression once as it will be used continuously
	patternTag = Pattern.compile(HTML_TAG_PATTERN);
	
	for (ArchiveRecord r : value) {
		try {
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
			// We're only interested in processing the responses, not requests or metadata.
			// Constant-first equals: a record with no mimetype is skipped, not counted as an exception.
			if ("application/http; msgtype=response".equals(r.getHeader().getMimetype())) {
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				String content = new String(rawData);
				// Locate the blank line that ends the HTTP header once and reuse the index.
				// If it is missing, substring() throws and the record is counted by the catch below.
				int headerEnd = content.indexOf("\r\n\r\n");
				// The HTTP header gives us valuable information about what was received during the request
				String headerText = content.substring(0, headerEnd);
				
				// In our task, we're only interested in text/html, so we can be a little lax
				// TODO: Proper HTTP header parsing + don't trust headers
				if (headerText.contains("Content-Type: text/html")) {
					context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
					// Only extract the body of the HTTP response when necessary
					String body = content.substring(headerEnd + 4);
					// Process all the matched HTML tags found in the body of the document
					matcherTag = patternTag.matcher(body);
					while (matcherTag.find()) {
						String tagName = matcherTag.group(1);
						// Locale-independent lower-casing keeps keys stable on every JVM default locale.
						outKey.set(tagName.toLowerCase(java.util.Locale.ROOT));
						context.write(outKey, outVal);
					}
				}
			}
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
示例7
/**
 * Emits one ({@code token}, {@code outVal}) pair for every whitespace-separated
 * token in each record whose mimetype is {@code text/plain}.
 *
 * Records with any other mimetype, or none, are counted under
 * {@code MAPPERCOUNTER.NON_PLAIN_TEXT}. Plain-text records with no tokens are
 * counted under {@code EMPTY_PAGE_TEXT}. Any exception is logged and counted
 * under {@code EXCEPTIONS}, and processing continues.
 *
 * @param key not read by this method
 * @param value the archive whose records are scanned
 * @param context receives the output pairs and the counters
 * @throws IOException declared by the overridden method
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	for (ArchiveRecord r : value) {
		try {
			// Constant-first equals: a missing mimetype is classified as non-plain-text
			// rather than raising a NullPointerException.
			if ("text/plain".equals(r.getHeader().getMimetype())) {
				context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
				LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
				// Convenience function that reads the full message into a raw byte array
				byte[] rawData = IOUtils.toByteArray(r, r.available());
				String content = new String(rawData);
				// Grab each word from the document
				tokenizer = new StringTokenizer(content);
				if (!tokenizer.hasMoreTokens()) {
					context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
				} else {
					while (tokenizer.hasMoreTokens()) {
						outKey.set(tokenizer.nextToken());
						context.write(outKey, outVal);
					}
				}
			} else {
				context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
			}
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}
 
示例8
/**
 * Builds a WARC reader for the given file, positioned at {@code offset}.
 *
 * A file that passes {@code testCompressedWARCFile} is opened with a
 * {@code CompressedWARCReader}. Any other file must pass the
 * extension-and-magic check before an {@code UncompressedWARCReader} is
 * created for it.
 *
 * @param f the file to open
 * @param offset forwarded unchanged to the reader constructor
 * @return a reader over {@code f}
 * @throws IOException if the file is not compressed and fails the
 *         extension-and-magic check, or if a called helper throws it
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
		throws IOException {
	// Compressed files skip the extension/magic validation entirely.
	if (testCompressedWARCFile(f)) {
		return (WARCReader) WARCReaderFactory.factory.new CompressedWARCReader(f, offset);
	}

	final boolean readableWarc = FileUtils.isReadableWithExtensionAndMagic(
			f, DOT_WARC_FILE_EXTENSION, WARC_MAGIC);
	if (!readableWarc) {
		final String path = f.getAbsolutePath();
		throw new IOException(path + " is not a WARC file.");
	}

	return (WARCReader) WARCReaderFactory.factory.new UncompressedWARCReader(f, offset);
}
 
示例9
/**
 * Builds a WARC reader over an already-open stream, choosing the reader
 * type from the name's extension.
 *
 * @param f name used for the {@code .gz} suffix test and passed to the reader
 * @param is the stream the reader consumes
 * @param atFirstRecord forwarded to {@code CompressedWARCReader} only; the
 *        uncompressed reader does not receive it
 * @return a compressed reader when {@code f} ends with {@code .gz},
 *         otherwise an uncompressed one
 * @throws IOException declared for the reader constructors
 */
protected ArchiveReader getArchiveReader(final String f,
		final InputStream is, final boolean atFirstRecord)
		throws IOException {
	// Compression is inferred from the name alone; the stream is not inspected here.
	final boolean gzipSuffix = f.endsWith(".gz");
	if (!gzipSuffix) {
		return new UncompressedWARCReader(f, is);
	}
	return new CompressedWARCReader(f, is, atFirstRecord);
}
 
示例10
/**
 * Opens every entry of {@code files} through
 * {@code WARCReaderFactory.get(String, InputStream, boolean)} and checks that
 * the first record reports the {@code warcinfo} type.
 *
 * @throws IOException if a test file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
	// Check the test files can be opened:
	for( String file : files ) {
		FileInputStream is = new FileInputStream(file);
		try {
			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
			ArchiveRecord r = ar.get();
			String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
			// Check the first record comes out as a 'warcinfo' record.
			assertEquals(WARCRecordType.warcinfo.name(), type);
		} finally {
			// Close each file's stream so a failing assertion does not leak descriptors.
			is.close();
		}
	}
}
 
示例11
/**
 * Opens the archive file named by {@code getName()} under {@code baseDir}
 * and reports whether its reader says it is compressed.
 *
 * @return the value of {@code isCompressed()} on the opened reader
 * @throws IOException if the reader cannot be opened or closed
 */
public boolean checkIsCompressed() throws IOException {
	ArchiveReader reader = ArchiveReaderFactory.get(new File(baseDir, this.getName()));
	try {
		return reader.isCompressed();
	} finally {
		// Guarantee the reader is released on every exit path.
		reader.close();
	}
}
 
示例12
/**
 * Returns the {@code ar} reader held by this object.
 *
 * @return the current {@code ArchiveReader}; nothing is computed or advanced here
 * @throws IOException declared by the overridden method; not thrown by this body
 * @throws InterruptedException declared by the overridden method; not thrown by this body
 */
@Override
public ArchiveReader getCurrentValue() throws IOException, InterruptedException {
	// We only ever have one value to give -- the output of the compressed file
	return ar;
}
 
示例13
/**
 * Creates a fresh {@code WARCFileRecordReader}.
 *
 * @param split not used by this method
 * @param context not used by this method
 * @return a new {@code WARCFileRecordReader} built with its no-argument constructor
 * @throws IOException declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
public RecordReader<Text, ArchiveReader> createRecordReader(InputSplit split, TaskAttemptContext context)
		throws IOException, InterruptedException {
	return new WARCFileRecordReader();
}
 
示例14
/**
 * Convenience overload that delegates to the three-argument
 * {@code getArchiveReader}, passing {@code true} as the second argument.
 *
 * @param f the file to open
 * @param offset forwarded unchanged to the three-argument overload
 * @return whatever the three-argument overload returns
 * @throws IOException if the delegate throws it
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
   throws IOException {
   	// NOTE(review): the literal true is presumably a skip-suffix-test flag; confirm against the overload.
   	return getArchiveReader(f, true, offset);
}
 
示例15
/**
 * Static entry point that forwards to
 * {@code ARCReaderFactory.factory.getArchiveReader} with the same arguments.
 *
 * @param s forwarded unchanged as the first argument
 * @param is the stream handed to the factory instance
 * @param atFirstRecord forwarded unchanged as the third argument
 * @return the reader produced by the factory instance
 * @throws IOException if the factory instance throws it
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
throws IOException {
    return ARCReaderFactory.factory.getArchiveReader(s, is,
        atFirstRecord);
}
 
示例16
/**
 * Not implemented: every call throws.
 *
 * @param f not used by this method
 * @return never returns normally
 * @throws NotImplementedException always, with the message {@code "TODO"}
 */
@Override
public ArchiveReader getDeleteFileOnCloseReader(final File f) {
    throw new NotImplementedException("TODO");
}
 
示例17
/**
 * Static entry point that forwards to
 * {@code WARCReaderFactory.factory.getArchiveReader} with the same arguments.
 *
 * @param s forwarded unchanged as the first argument
 * @param is the stream handed to the factory instance
 * @param atFirstRecord forwarded unchanged as the third argument
 * @return the reader produced by the factory instance
 * @throws IOException if the factory instance throws it
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
throws IOException {
    return WARCReaderFactory.factory.getArchiveReader(s, is,
        atFirstRecord);
}