Java源码示例:org.apache.poi.hssf.extractor.ExcelExtractor
示例1
/**
 * Builds a text extractor for the document stored under the given OLE2 directory.
 * Documents other than Excel workbooks are delegated to the Scratchpad class, so
 * the Scratchpad jar is generally required for them.
 * Embedded OOXML resources are not checked here; use
 * {@link org.apache.poi.extractor.ExtractorFactory} for that.
 *
 * @param poifsDir the OLE2 directory node to inspect
 * @return an extractor for the document found in {@code poifsDir}
 * @throws IOException declared for the extractor constructors
 * @throws OldExcelFormatException if the directory holds an old-format (1-95) workbook entry
 * @throws IllegalArgumentException if the Scratchpad lookup fails or yields no extractor
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // The first matching workbook entry decides that this is an Excel document.
    for (String entryName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (!poifsDir.hasEntry(entryName)) {
            continue;
        }
        if (getPreferEventExtractor()) {
            return new EventBasedExcelExtractor(poifsDir);
        } else {
            return new ExcelExtractor(poifsDir);
        }
    }

    if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Not an Excel workbook: hand over to the Scratchpad factory via reflection.
    Class<?> scratchpad = getScratchpadClass();
    POITextExtractor extractor;
    try {
        Method factory = scratchpad.getDeclaredMethod("createExtractor", DirectoryNode.class);
        extractor = (POITextExtractor) factory.invoke(null, poifsDir);
    } catch (IllegalArgumentException iae) {
        // Already the exception type callers expect; pass it through unwrapped.
        throw iae;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
    }

    if (extractor == null) {
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }
    return extractor;
}
示例2
/**
 * Reads an Excel document from the given stream and returns its text.
 * Streams with a POIFS header are handled with {@code ExcelExtractor}; anything
 * else is opened as an {@code XSSFWorkbook} and every sheet is walked cell by cell.
 * For the workbook path, cells are separated by a space, rows by a newline, and
 * each sheet is followed by an extra newline.
 *
 * @param inputStream the document content; it is wrapped and closed by this method
 * @return the extracted text
 * @throws IOException if reading or closing the document fails
 * @throws OpenXML4JException declared by the original signature
 * @throws XmlException declared by the original signature
 */
private String microsoftExcelDocumentToString(InputStream inputStream) throws IOException, OpenXML4JException, XmlException {
    StringBuilder sb = new StringBuilder();
    // Wrapped in a BufferedInputStream before the header probe;
    // NOTE(review): the probe presumably needs mark/reset support - confirm against the POI docs.
    try (InputStream excelStream = new BufferedInputStream(inputStream)) {
        if (POIFSFileSystem.hasPOIFSHeader(excelStream)) { // Before 2007 format files
            POIFSFileSystem excelFS = new POIFSFileSystem(excelStream);
            // try-with-resources so the extractor is closed even if getText() throws.
            try (ExcelExtractor excelExtractor = new ExcelExtractor(excelFS)) {
                sb.append(excelExtractor.getText());
            }
        } else { // New format
            // The workbook is closed here; previously it was never closed.
            try (XSSFWorkbook workBook = new XSSFWorkbook(excelStream)) {
                int numberOfSheets = workBook.getNumberOfSheets();
                for (int i = 0; i < numberOfSheets; i++) {
                    // Use the loop index: the old code always read sheet 0.
                    XSSFSheet sheet = workBook.getSheetAt(i);
                    for (Row row : sheet) {
                        for (Cell cell : row) {
                            sb.append(cell.toString());
                            sb.append(" ");
                        }
                        sb.append("\n");
                    }
                    sb.append("\n");
                }
            }
        }
    }
    return sb.toString();
}
示例3
/**
 * Extracts the content of an Excel file as a string using the tools provided by POI.
 * The extractor is configured with {@code setFormulasNotResults(true)} and
 * {@code setIncludeSheetNames(true)}.
 *
 * @param excelFile the Excel file to extract
 * @return the extracted text, or {@code null} if an {@link IOException} occurs
 *         while reading the file
 */
public String excelExtractor(File excelFile) {
    // try-with-resources closes the file handle, which the old code leaked.
    try (FileInputStream in = new FileInputStream(excelFile)) {
        HSSFWorkbook wb = new HSSFWorkbook(in);
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(true);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    } catch (IOException e) {
        // Existing contract: report the failure on stderr and signal it with null.
        e.printStackTrace();
        return null;
    }
}
示例4
/**
 * {@inheritDoc}
 */
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    // Note: type and encoding are not consulted by this implementation.
    try {
        final POIFSFileSystem fileSystem = new POIFSFileSystem(stream);
        final ExcelExtractor extractor = new ExcelExtractor(fileSystem);
        return extractor.getText();
    } catch (RuntimeException failure) {
        // Runtime failures are logged and surfaced to callers as IOException.
        logger.warn("Failed to extract Excel text content", failure);
        throw new IOException(failure.getMessage(), failure);
    } finally {
        // The caller's stream is always closed, on success and on failure.
        stream.close();
    }
}
示例5
/**
 * Creates the mocked extractors, the indexer wrapper built on them, and a
 * placeholder {@code File2Index} before each test.
 */
@Before
public void setup() {
    excelExtractor = Mockito.mock(ExcelExtractor.class);
    xssfExtractor = Mockito.mock(XSSFExcelExtractor.class);
    msExcelIndexer = new MSExcelIndexerWrapper(xssfExtractor, excelExtractor);

    // Placeholder input: empty content and empty string fields.
    final byte[] noContent = "".getBytes();
    final String blank = "";
    file2Index = new AsyncIndexer.File2Index(noContent, blank, blank, -1234, blank);
}
示例6
/**
 * Extract metadata from a Microsoft Office document (Word, Excel or PowerPoint).
 * The summary information is read with the extractor matching {@code mimeType};
 * for any other MIME type an empty {@code OfficeMetadata} is returned.
 * Date properties that are absent from the document are left unset instead of
 * causing a {@link NullPointerException}.
 *
 * @param is stream with the OLE2 document content
 * @param mimeType MIME type used to choose the extractor
 * @return the metadata read from the document's summary information
 * @throws IOException if the document cannot be read
 */
public static OfficeMetadata officeExtractor(InputStream is, String mimeType) throws IOException {
    POIFSFileSystem fs = new POIFSFileSystem(is);
    OfficeMetadata md = new OfficeMetadata();
    SummaryInformation si = null;

    if (MimeTypeConfig.MIME_MS_WORD.equals(mimeType)) {
        si = new WordExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_EXCEL.equals(mimeType)) {
        si = new ExcelExtractor(fs).getSummaryInformation();
    } else if (MimeTypeConfig.MIME_MS_POWERPOINT.equals(mimeType)) {
        si = new PowerPointExtractor(fs).getSummaryInformation();
    }

    if (si != null) {
        md.setTitle(si.getTitle());
        md.setSubject(si.getSubject());
        md.setAuthor(si.getAuthor());
        md.setLastAuthor(si.getLastAuthor());
        md.setKeywords(si.getKeywords());
        md.setComments(si.getComments());
        md.setTemplate(si.getTemplate());
        md.setRevNumber(si.getRevNumber());
        md.setApplicationName(si.getApplicationName());
        md.setEditTime(si.getEditTime());
        md.setPageCount(si.getPageCount());
        md.setWordCount(si.getWordCount());
        md.setCharCount(si.getCharCount());
        md.setSecurity(si.getSecurity());

        // Any of the date properties may be missing (e.g. a document that was
        // never printed), so each one is only set when present.
        Calendar createDateTime = officeDateToCalendar(si.getCreateDateTime());
        if (createDateTime != null) {
            md.setCreateDateTime(createDateTime);
        }

        Calendar lastSaveDateTime = officeDateToCalendar(si.getLastSaveDateTime());
        if (lastSaveDateTime != null) {
            md.setLastSaveDateTime(lastSaveDateTime);
        }

        Calendar lastPrinted = officeDateToCalendar(si.getLastPrinted());
        if (lastPrinted != null) {
            md.setLastPrinted(lastPrinted);
        }
    }

    log.info("officeExtractor: {}", md);
    return md;
}

/**
 * Converts a possibly-null date into a calendar set to that instant.
 *
 * @param date the date to convert, may be {@code null}
 * @return a calendar set to {@code date}, or {@code null} if {@code date} is {@code null}
 */
private static Calendar officeDateToCalendar(java.util.Date date) {
    if (date == null) {
        return null;
    }
    Calendar calendar = Calendar.getInstance();
    calendar.setTime(date);
    return calendar;
}
示例7
/**
 * Appends the text of a header or footer, followed by a single space, to the
 * buffer. Nothing is appended when the extracted text is empty.
 *
 * @param buffy the buffer receiving the text
 * @param hf the header or footer to read
 */
private void extractHeaderFooter(final StringBuilder buffy, final HeaderFooter hf) {
    final String text = ExcelExtractor._extractHeaderFooter(hf);
    if (text.isEmpty()) {
        return;
    }
    buffy.append(text);
    buffy.append(' ');
}
示例8
/**
 * Builds an {@code ExcelExtractor} over the in-memory bytes of the given file.
 *
 * @param fileData the file whose {@code data} bytes hold the document
 * @return an extractor reading from those bytes
 * @throws IOException if the bytes cannot be read as a POIFS file system
 */
protected ExcelExtractor getExcelExtractor(File2Index fileData) throws IOException {
    final ByteArrayInputStream content = new ByteArrayInputStream(fileData.data);
    final POIFSFileSystem fileSystem = new POIFSFileSystem(content);
    return new ExcelExtractor(fileSystem);
}
示例9
/**
 * Creates a wrapper that holds the two supplied extractors.
 *
 * @param xssfExtractor stored in the {@code xssfExcelExtractor} field
 * @param excelExtractor stored in the {@code excelExtractor} field
 */
public MSExcelIndexerWrapper(XSSFExcelExtractor xssfExtractor, ExcelExtractor excelExtractor) {
    this.excelExtractor = excelExtractor;
    xssfExcelExtractor = xssfExtractor;
}
示例10
/**
 * Returns the extractor held in the {@code excelExtractor} field; the supplied
 * file data is ignored.
 *
 * @param fileData not used
 * @return the value of the {@code excelExtractor} field
 */
@Override
protected ExcelExtractor getExcelExtractor(AsyncIndexer.File2Index fileData) throws IOException {
    final ExcelExtractor held = excelExtractor;
    return held;
}