Java源码示例:org.archive.io.ArchiveReader
示例1
/**
 * Opens an ARC reader over {@code arcFile}.
 *
 * <p>{@code testCompressedARCFile} decides which reader implementation is
 * used. A file that is not reported as compressed must additionally pass the
 * readable / extension / magic-number check before a reader is created.
 *
 * @param arcFile the file to open
 * @param skipSuffixTest forwarded to {@code testCompressedARCFile}
 * @param offset forwarded to the reader constructor
 * @return a {@code CompressedARCReader} or an {@code UncompressedARCReader}
 * @throws IOException if the file is not compressed and fails the ARC
 *         extension / magic-number check; may also propagate from the
 *         helpers or reader constructors
 */
protected ArchiveReader getArchiveReader(final File arcFile,
        final boolean skipSuffixTest, final long offset)
        throws IOException {
    if (testCompressedARCFile(arcFile, skipSuffixTest)) {
        return (ARCReader) ARCReaderFactory.factory.new CompressedARCReader(arcFile, offset);
    }

    // Only uncompressed files are validated against the ARC extension and magic number.
    final boolean looksLikeArc = FileUtils.isReadableWithExtensionAndMagic(
            arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!looksLikeArc) {
        throw new IOException(arcFile.getAbsolutePath() +
                " is not an Internet Archive ARC file.");
    }
    return (ARCReader) ARCReaderFactory.factory.new UncompressedARCReader(arcFile, offset);
}
示例2
/**
 * Seeks to {@code offset} in {@code testfile}, hands the resulting stream to
 * {@code ARCReaderFactory.get(String, InputStream, boolean)} and checks the
 * first record that comes back.
 *
 * @param testfile the archive file to read from
 * @param offset byte offset to seek to before reading
 * @param uri the URL the record's header is expected to report
 * @throws Exception if opening, seeking or reading fails
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    RandomAccessFile raf = new RandomAccessFile(testfile, "r");
    try {
        raf.seek(offset);
        // The stream shares the file descriptor, so it starts reading at the seek position.
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        // Opening by file and offset instead is known to work:
        //ArchiveReader reader = ARCReaderFactory.get(testfile, offset);
        ArchiveRecord record = reader.get();
        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);
        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    } finally {
        // Always release the file handle, including when an assertion above fails.
        raf.close();
    }
}
示例3
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>Opens the file named by {@code getName()} under {@code baseDir}, records
 * whether it is compressed in {@code this.compressed}, and walks every record.
 * WARC records are indexed only when their type header equals
 * {@code WARCConstants.RESPONSE} and their MIME type is not {@code text/dns};
 * every non-WARC record is passed to {@code indexARCRecord}.
 *
 * @param baseDir the base directory of the arcs
 * @return a map filled in by {@code indexWARCResponse} and {@code indexARCRecord}
 * @throws IOException thrown if there is an error
 * @throws ParseException presumably raised by the per-record indexing helpers
 *         (not visible here) -- confirm against their declarations
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
    Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
    File theArchiveFile = new File(baseDir, this.getName());
    ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
    try {
        this.compressed = reader.isCompressed();
        Iterator<ArchiveRecord> it = reader.iterator();
        while(it.hasNext()) {
            ArchiveRecord rec = it.next();
            if(rec instanceof WARCRecord) {
                String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
                if(type.equals(WARCConstants.RESPONSE)) {
                    String mime = rec.getHeader().getMimetype();
                    if(!mime.equals("text/dns")) {
                        indexWARCResponse(rec, results);
                    }
                }
            }
            else {
                indexARCRecord(rec, results);
            }
        }
    } finally {
        // Close the reader even when a record fails to parse or index.
        reader.close();
    }
    return results;
}
示例4
/**
 * Fetches one WARC file from the CommonCrawl S3 bucket and prints the header,
 * URL and first 500 characters of the first few records.
 *
 * @param args unused
 * @throws IOException if reading the archive fails
 * @throws S3ServiceException if the S3 object cannot be retrieved
 */
public static void main(String[] args) throws IOException, S3ServiceException {
    // We're accessing a publicly available bucket so don't need to fill in our credentials
    S3Service s3s = new RestS3Service(null);
    // Let's grab a file out of the CommonCrawl S3 bucket
    String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
    // The file name identifies the ArchiveReader and indicates if it should be decompressed
    ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
    try {
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for(ArchiveRecord r : ar) {
            // The header file contains information such as the type of record, size, creation time, and URL
            System.out.println("Header: " + r.getHeader());
            System.out.println("URL: " + r.getHeader().getUrl());
            System.out.println();
            // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
            // Create a byte array that is as long as the record's stated length
            byte[] rawData = new byte[r.available()];
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the record ends.
            int filled = 0;
            while (filled < rawData.length) {
                int n = r.read(rawData, filled, rawData.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            // Note: potential optimization would be to have a large buffer only allocated once
            // Why don't we convert it to a string and print the start of it? Let's hope it's text!
            String content = new String(rawData, 0, filled);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Pretty printing to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            // Post-increment: the loop stops after the sixth record has been printed.
            if (i++ > 4) break;
        }
    } finally {
        // Release the reader when done or on failure.
        ar.close();
    }
}
示例5
/**
 * Reads a local compressed WARC file and prints the header, URL and first 500
 * characters of the first few records.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // Set up a local compressed WARC file for reading
    String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    FileInputStream is = new FileInputStream(fn);
    try {
        // The file name identifies the ArchiveReader and indicates if it should be decompressed
        ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
        try {
            // Once we have an ArchiveReader, we can work through each of the records it contains
            int i = 0;
            for(ArchiveRecord r : ar) {
                // The header file contains information such as the type of record, size, creation time, and URL
                System.out.println(r.getHeader());
                System.out.println(r.getHeader().getUrl());
                System.out.println();
                // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
                // Create a byte array that is as long as the record's stated length
                byte[] rawData = IOUtils.toByteArray(r, r.available());
                // Why don't we convert it to a string and print the start of it? Let's hope it's text!
                String content = new String(rawData);
                System.out.println(content.substring(0, Math.min(500, content.length())));
                System.out.println((content.length() > 500 ? "..." : ""));
                // Pretty printing to make the output more readable
                System.out.println("=-=-=-=-=-=-=-=-=");
                // Post-increment: the loop stops after the sixth record has been printed.
                if (i++ > 4) break;
            }
        } finally {
            ar.close();
        }
    } finally {
        // Also covers the case where the factory call itself fails.
        is.close();
    }
}
示例6
/**
 * Emits one {@code (tagName, outVal)} pair for every HTML tag matched in the
 * body of each {@code text/html} HTTP response record of the archive.
 *
 * <p>Any exception raised while handling a record is logged and counted under
 * {@code MAPPERCOUNTER.EXCEPTIONS}; processing continues with the next record.
 *
 * @param key the input key (unused here)
 * @param value the archive whose records are scanned
 * @param context used for counters and output
 * @throws IOException declared by the signature; per-record failures are caught
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
    // Compile the regular expression once per call, before iterating over the records
    patternTag = Pattern.compile(HTML_TAG_PATTERN);
    for (ArchiveRecord r : value) {
        try {
            LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
            // We're only interested in processing the responses, not requests or metadata
            if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {
                // Convenience function that reads the full message into a raw byte array
                byte[] rawData = IOUtils.toByteArray(r, r.available());
                String content = new String(rawData);
                // The blank line separating HTTP header from body; located once and reused.
                // If it is missing (-1), substring below throws and the record is
                // counted under EXCEPTIONS by the catch block.
                int headerEnd = content.indexOf("\r\n\r\n");
                // The HTTP header gives us valuable information about what was received during the request
                String headerText = content.substring(0, headerEnd);
                // In our task, we're only interested in text/html, so we can be a little lax
                // TODO: Proper HTTP header parsing + don't trust headers
                if (headerText.contains("Content-Type: text/html")) {
                    context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
                    // Only extract the body of the HTTP response when necessary
                    // (on current JDKs substring copies the characters, so this does cost memory)
                    String body = content.substring(headerEnd + 4);
                    // Process all the matched HTML tags found in the body of the document
                    matcherTag = patternTag.matcher(body);
                    while (matcherTag.find()) {
                        String tagName = matcherTag.group(1);
                        // Locale.ROOT keeps keys stable regardless of the JVM's default
                        // locale (e.g. "TITLE" must not become "t\u0131tle" under tr_TR).
                        outKey.set(tagName.toLowerCase(java.util.Locale.ROOT));
                        context.write(outKey, outVal);
                    }
                }
            }
        }
        catch (Exception ex) {
            LOG.error("Caught Exception", ex);
            context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        }
    }
}
示例7
/**
 * Emits one {@code (token, outVal)} pair for every whitespace-delimited token
 * of each {@code text/plain} record in the archive.
 *
 * <p>Records with another MIME type are counted under {@code NON_PLAIN_TEXT};
 * plain-text records without any token are counted under
 * {@code EMPTY_PAGE_TEXT}. Any exception raised while handling a record is
 * logged and counted under {@code EXCEPTIONS}, then the next record is tried.
 *
 * @param key the input key (unused here)
 * @param value the archive whose records are scanned
 * @param context used for counters and output
 * @throws IOException declared by the signature; per-record failures are caught
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
    for (ArchiveRecord record : value) {
        try {
            final String mimeType = record.getHeader().getMimetype();
            if (!mimeType.equals("text/plain")) {
                context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
                continue;
            }

            context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
            LOG.debug(record.getHeader().getUrl() + " -- " + record.available());

            // Pull the whole record into memory and decode it as text.
            final byte[] payload = IOUtils.toByteArray(record, record.available());
            final String text = new String(payload);

            // Split the document into words.
            tokenizer = new StringTokenizer(text);
            if (tokenizer.hasMoreTokens()) {
                do {
                    outKey.set(tokenizer.nextToken());
                    context.write(outKey, outVal);
                } while (tokenizer.hasMoreTokens());
            } else {
                context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
            }
        } catch (Exception ex) {
            LOG.error("Caught Exception", ex);
            context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
        }
    }
}
示例8
/**
 * Opens a WARC reader over {@code f}.
 *
 * <p>{@code testCompressedWARCFile} selects the reader implementation. A file
 * that is not reported as compressed must pass the readable / extension /
 * magic check before a reader is created.
 *
 * @param f the file to open
 * @param offset forwarded to the reader constructor
 * @return a {@code CompressedWARCReader} or an {@code UncompressedWARCReader}
 * @throws IOException if the file is not compressed and fails the WARC
 *         check; may also propagate from the helpers or reader constructors
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
        throws IOException {
    if (testCompressedWARCFile(f)) {
        return (WARCReader) WARCReaderFactory.factory.new CompressedWARCReader(f, offset);
    }

    // Only uncompressed files are validated against the WARC extension and magic.
    final boolean looksLikeWarc = FileUtils.isReadableWithExtensionAndMagic(
            f, DOT_WARC_FILE_EXTENSION, WARC_MAGIC);
    if (!looksLikeWarc) {
        throw new IOException(f.getAbsolutePath()
                + " is not a WARC file.");
    }
    return (WARCReader) WARCReaderFactory.factory.new UncompressedWARCReader(f, offset);
}
示例9
/**
 * Creates a WARC reader over an already-open stream, choosing the
 * implementation from the name alone.
 *
 * @param f the name; a {@code ".gz"} suffix selects the compressed reader
 * @param is the stream to read from
 * @param atFirstRecord passed to the compressed reader only; the
 *        uncompressed reader does not receive it
 * @return a {@code CompressedWARCReader} or an {@code UncompressedWARCReader}
 * @throws IOException declared by the signature; presumably raised by the
 *         reader constructors
 */
protected ArchiveReader getArchiveReader(final String f,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    // Compression is inferred purely from the file extension.
    final boolean gzipped = f.endsWith(".gz");
    if (!gzipped) {
        return new UncompressedWARCReader(f, is);
    }
    return new CompressedWARCReader(f, is, atFirstRecord);
}
示例10
/**
 * Checks that every file in {@code files} can be opened through
 * {@code WARCReaderFactory.get(String, InputStream, boolean)} and that the
 * first record of each is a {@code warcinfo} record.
 *
 * @throws IOException if a file cannot be opened or read
 */
public void testGetStringInputstreamBoolean() throws IOException {
    // Check the test files can be opened:
    for( String file : files ) {
        FileInputStream is = new FileInputStream(file);
        try {
            ArchiveReader ar = WARCReaderFactory.get(file, is, true);
            try {
                ArchiveRecord r = ar.get();
                String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
                // Check the first record comes out as a 'warcinfo' record.
                assertEquals(WARCRecordType.warcinfo.name(), type);
            } finally {
                ar.close();
            }
        } finally {
            // Release the handle for each file, including on assertion failure.
            is.close();
        }
    }
}
示例11
/**
 * Opens the file named by {@code getName()} under {@code baseDir} and reports
 * whether the reader considers it compressed.
 *
 * @return the value of {@code ArchiveReader.isCompressed()} for that file
 * @throws IOException if the reader cannot be opened or closed
 */
public boolean checkIsCompressed() throws IOException {
    ArchiveReader reader = ArchiveReaderFactory.get(new File(baseDir, this.getName()));
    try {
        return reader.isCompressed();
    } finally {
        // Close the reader on every exit path.
        reader.close();
    }
}
示例12
/**
 * Returns the reader held in {@code ar}.
 *
 * @return the single value this record reader offers
 * @throws IOException declared by the overridden signature
 * @throws InterruptedException declared by the overridden signature
 */
@Override
public ArchiveReader getCurrentValue() throws IOException, InterruptedException {
    // There is only ever one value: the reader over the compressed file.
    return ar;
}
示例13
/**
 * Supplies a new {@code WARCFileRecordReader}; neither argument is
 * consulted here.
 *
 * @param split the input split (unused in this method)
 * @param context the task context (unused in this method)
 * @return a freshly constructed {@code WARCFileRecordReader}
 * @throws IOException declared by the overridden signature
 * @throws InterruptedException declared by the overridden signature
 */
@Override
public RecordReader<Text, ArchiveReader> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final RecordReader<Text, ArchiveReader> warcRecordReader = new WARCFileRecordReader();
    return warcRecordReader;
}
示例14
/**
 * Convenience overload that delegates to the three-argument
 * {@code getArchiveReader}, passing {@code true} as its second argument.
 *
 * @param f the file to open
 * @param offset forwarded unchanged
 * @return whatever the three-argument overload returns
 * @throws IOException propagated from the delegate
 */
protected ArchiveReader getArchiveReader(final File f, final long offset)
        throws IOException {
    final boolean secondArgument = true;
    return getArchiveReader(f, secondArgument, offset);
}
示例15
/**
 * Static entry point that forwards to {@code getArchiveReader} on the
 * {@code ARCReaderFactory.factory} instance.
 *
 * @param s forwarded as the first argument
 * @param is the stream to read from
 * @param atFirstRecord forwarded unchanged
 * @return the reader produced by the factory instance
 * @throws IOException propagated from the factory instance
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
        throws IOException {
    final ArchiveReader arcReader =
            ARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return arcReader;
}
示例16
/**
 * Not implemented: every call throws.
 *
 * @param f ignored
 * @return never returns normally
 * @throws NotImplementedException always, with the message {@code "TODO"}
 */
@Override
public ArchiveReader getDeleteFileOnCloseReader(final File f) {
    final String message = "TODO";
    throw new NotImplementedException(message);
}
示例17
/**
 * Static entry point that forwards to {@code getArchiveReader} on the
 * {@code WARCReaderFactory.factory} instance.
 *
 * @param s forwarded as the first argument
 * @param is the stream to read from
 * @param atFirstRecord forwarded unchanged
 * @return the reader produced by the factory instance
 * @throws IOException propagated from the factory instance
 */
public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
        throws IOException {
    final ArchiveReader warcReader =
            WARCReaderFactory.factory.getArchiveReader(s, is, atFirstRecord);
    return warcReader;
}