Java源码示例:org.apache.poi.hssf.extractor.ExcelExtractor

示例1
/**
 * Builds a text extractor for the document held in the given OLE2 directory.
 * Excel workbooks are handled here; anything else is delegated, via reflection,
 * to the {@code createExtractor(DirectoryNode)} method of the Scratchpad class,
 * so the Scratchpad jar is generally required.
 * Embedded OOXML resources are not inspected; use
 *  {@link org.apache.poi.extractor.ExtractorFactory} for that.
 *
 * @param poifsDir the OLE2 directory to inspect
 * @return an extractor suited to the document found in {@code poifsDir}
 * @throws IOException declared for extractor construction failures
 * @throws OldExcelFormatException if the old workbook entry is present
 * @throws IllegalArgumentException if the Scratchpad lookup fails or no
 *         supported document is found
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // The presence of any known workbook entry identifies an Excel document.
    for (String entryName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (!poifsDir.hasEntry(entryName)) {
            continue;
        }
        final boolean eventBased = getPreferEventExtractor();
        if (eventBased) {
            return new EventBasedExcelExtractor(poifsDir);
        } else {
            return new ExcelExtractor(poifsDir);
        }
    }

    final boolean legacyWorkbook = poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME);
    if (legacyWorkbook) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Not Excel: hand the directory over to Scratchpad, or fail trying.
    final Class<?> scratchpad = getScratchpadClass();
    final POITextExtractor delegated;
    try {
        final Method factory = scratchpad.getDeclaredMethod("createExtractor", DirectoryNode.class);
        delegated = (POITextExtractor) factory.invoke(null, poifsDir);
    } catch (IllegalArgumentException passthrough) {
        // Already the exception type callers expect; do not wrap it again.
        throw passthrough;
    } catch (Exception cause) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", cause);
    }

    if (delegated == null) {
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }
    return delegated;
}
 
示例2
/**
 * Reads an Excel document from the given stream and returns its text.
 * Streams that start with a POIFS header are read with {@link ExcelExtractor};
 * everything else is opened as an {@link XSSFWorkbook} and every cell of every
 * sheet is appended, cells separated by a space, rows and sheets terminated by
 * a newline.
 *
 * @param inputStream the document content; it is wrapped in a buffered stream
 *                    and closed when this method returns
 * @return the extracted text
 * @throws IOException if the stream cannot be read or parsed
 * @throws OpenXML4JException declared for OOXML package failures
 * @throws XmlException declared for XML parsing failures
 */
private String microsoftExcelDocumentToString(InputStream inputStream) throws IOException, OpenXML4JException, XmlException {
    StringBuilder sb = new StringBuilder();

    try (InputStream excelStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(excelStream)) { // Before 2007 format files
            POIFSFileSystem excelFS = new POIFSFileSystem(excelStream);
            // try-with-resources so the extractor is closed even if getText() throws
            try (ExcelExtractor excelExtractor = new ExcelExtractor(excelFS)) {
                sb.append(excelExtractor.getText());
            }
        } else { // New format
            try (XSSFWorkbook workBook = new XSSFWorkbook(excelStream)) {
                int numberOfSheets = workBook.getNumberOfSheets();
                for (int i = 0; i < numberOfSheets; i++) {
                    // Index by i: the previous code always read sheet 0, so only
                    // the first sheet was ever extracted (once per sheet).
                    XSSFSheet sheet = workBook.getSheetAt(i);
                    for (Row row : sheet) {
                        for (Cell cell : row) {
                            sb.append(cell.toString());
                            sb.append(" ");
                        }
                        sb.append("\n");
                    }
                    sb.append("\n");
                }
            }
        }
    }

    return sb.toString();
}
 
示例3
/**
 * Extracts the content of an Excel file as a string using the tooling provided
 * by POI. The file is loaded through {@code HSSFWorkbook} and the extractor is
 * configured with {@code setFormulasNotResults(true)} and
 * {@code setIncludeSheetNames(true)}.
 *
 * @param excelFile the Excel file to extract text from
 * @return the extracted text, or {@code null} if the file could not be read
 */
public String excelExtractor(File excelFile) {

    // try-with-resources: the original never closed the file stream or the
    // workbook, leaking a file handle on every call.
    try (FileInputStream in = new FileInputStream(excelFile);
         HSSFWorkbook wb = new HSSFWorkbook(in)) {
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(true);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    } catch (IOException e) {
        // Best-effort contract kept for existing callers: report and return null.
        e.printStackTrace();
        return null;
    }
}
 
示例4
/**
 * {@inheritDoc}
 * <p>
 * Reads the stream as a POIFS file system and returns the text produced by
 * {@link ExcelExtractor}. Runtime failures are logged and rethrown as
 * {@link IOException}; the stream is always closed before returning.
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
	try {
		final ExcelExtractor extractor = new ExcelExtractor(new POIFSFileSystem(stream));
		return extractor.getText();
	} catch (RuntimeException failure) {
		logger.warn("Failed to extract Excel text content", failure);
		final String reason = failure.getMessage();
		throw new IOException(reason, failure);
	} finally {
		stream.close();
	}
}
 
示例5
/**
 * Prepares the fixture: mocked extractors, an indexer wrapper built around
 * them, and a placeholder file entry with empty content.
 */
@Before
public void setup() {
    xssfExtractor = Mockito.mock(XSSFExcelExtractor.class);
    excelExtractor = Mockito.mock(ExcelExtractor.class);
    msExcelIndexer = new MSExcelIndexerWrapper(xssfExtractor, excelExtractor);

    final byte[] emptyPayload = "".getBytes();
    file2Index = new AsyncIndexer.File2Index(emptyPayload, "", "", -1234, "");
}
 
示例6
/**
 * Extract summary metadata from an Office document (Word, Excel or PowerPoint).
 * The extractor used to read the summary information is chosen from the MIME
 * type; for any other MIME type an empty {@code OfficeMetadata} is returned.
 *
 * @param is stream holding the document, read as a POIFS file system
 * @param mimeType MIME type of the document, compared against the
 *                 {@code MimeTypeConfig} constants
 * @return the populated metadata; date fields are left untouched when the
 *         document does not carry the corresponding date
 * @throws IOException if the stream cannot be read as a POIFS file system
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
	POIFSFileSystem fs = new POIFSFileSystem(is);
	OfficeMetadata md = new OfficeMetadata();
	SummaryInformation si = null;

	if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
		si = new WordExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
		si = new ExcelExtractor(fs).getSummaryInformation();
	} else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
		si = new PowerPointExtractor(fs).getSummaryInformation();
	}

	if (si != null) {
		md.setTitle(si.getTitle());
		md.setSubject(si.getSubject());
		md.setAuthor(si.getAuthor());
		md.setLastAuthor(si.getLastAuthor());
		md.setKeywords(si.getKeywords());
		md.setComments(si.getComments());
		md.setTemplate(si.getTemplate());
		md.setRevNumber(si.getRevNumber());
		md.setApplicationName(si.getApplicationName());
		md.setEditTime(si.getEditTime());
		md.setPageCount(si.getPageCount());
		md.setWordCount(si.getWordCount());
		md.setCharCount(si.getCharCount());
		md.setSecurity(si.getSecurity());

		// Each date may be missing from the document; Calendar.setTime(null)
		// would throw a NullPointerException, so absent dates are skipped.
		Calendar createDateTime = toCalendar(si.getCreateDateTime());
		if (createDateTime != null) {
			md.setCreateDateTime(createDateTime);
		}

		Calendar lastSaveDateTime = toCalendar(si.getLastSaveDateTime());
		if (lastSaveDateTime != null) {
			md.setLastSaveDateTime(lastSaveDateTime);
		}

		Calendar lastPrinted = toCalendar(si.getLastPrinted());
		if (lastPrinted != null) {
			md.setLastPrinted(lastPrinted);
		}
	}

	log.info("officeExtractor: {}", md);
	return md;
}

/**
 * Converts a date into a {@code Calendar} set to that instant.
 *
 * @param date the date to convert, may be {@code null}
 * @return a new calendar, or {@code null} when {@code date} is {@code null}
 */
private static Calendar toCalendar(java.util.Date date) {
	if (date == null) {
		return null;
	}
	Calendar calendar = Calendar.getInstance();
	calendar.setTime(date);
	return calendar;
}
 
示例7
/**
 * Appends the text of the given header or footer, followed by a single space,
 * to the buffer. Nothing is appended when the extracted text is empty.
 *
 * @param buffy the buffer receiving the text
 * @param hf the header or footer to read
 */
private void extractHeaderFooter(final StringBuilder buffy, final HeaderFooter hf) {
    final String text = ExcelExtractor._extractHeaderFooter(hf);
    if (text.isEmpty()) {
        return;
    }
    buffy.append(text);
    buffy.append(' ');
}
 
示例8
/**
 * Creates an {@link ExcelExtractor} over the in-memory bytes of the given file.
 *
 * @param fileData the file whose {@code data} bytes hold the document
 * @return an extractor backed by a POIFS file system built from those bytes
 * @throws IOException if the bytes cannot be read as a POIFS file system
 */
protected ExcelExtractor getExcelExtractor(File2Index fileData) throws IOException {
	final ByteArrayInputStream rawBytes = new ByteArrayInputStream(fileData.data);
	final POIFSFileSystem fileSystem = new POIFSFileSystem(rawBytes);
	return new ExcelExtractor(fileSystem);
}
 
示例9
/**
 * Creates a wrapper that holds on to the supplied extractors.
 *
 * @param xssfExtractor the extractor stored in {@code xssfExcelExtractor}
 * @param excelExtractor the extractor stored in {@code excelExtractor}
 */
public MSExcelIndexerWrapper(XSSFExcelExtractor xssfExtractor, ExcelExtractor excelExtractor) {
    this.excelExtractor = excelExtractor;
    this.xssfExcelExtractor = xssfExtractor;
}
 
示例10
/**
 * Returns the extractor held by this instance; {@code fileData} is not read.
 */
@Override
protected ExcelExtractor getExcelExtractor(AsyncIndexer.File2Index fileData) throws IOException {
    final ExcelExtractor held = this.excelExtractor;
    return held;
}