Improving tika PDF handling (#101486)
Correctly handling PDF files with public key encryption in attachment processor
This commit is contained in:
parent
6d9d5469eb
commit
a59180459a
|
@ -0,0 +1,5 @@
|
|||
pr: 101486
|
||||
summary: Improving tika handling
|
||||
area: Ingest Node
|
||||
type: bug
|
||||
issues: []
|
|
@ -107,6 +107,15 @@ final class TikaImpl {
|
|||
} else {
|
||||
throw new AssertionError(cause);
|
||||
}
|
||||
} catch (LinkageError e) {
|
||||
if (e.getMessage().contains("bouncycastle")) {
|
||||
/*
|
||||
* Elasticsearch does not ship with bouncycastle. It is only used for public-key-encrypted PDFs, which this module does
|
||||
* not support anyway.
|
||||
*/
|
||||
throw new RuntimeException("document is encrypted", e);
|
||||
}
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -243,11 +243,25 @@ public class AttachmentProcessorTests extends ESTestCase {
|
|||
assertThat(attachmentData.get("content_length"), is(0L));
|
||||
}
|
||||
|
||||
public void testEncryptedPdf() throws Exception {
|
||||
public void testEncryptedWithPasswordPdf() throws Exception {
|
||||
/*
|
||||
* This tests that a PDF that has been encrypted with a password fails in the way expected
|
||||
*/
|
||||
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
|
||||
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
|
||||
}
|
||||
|
||||
public void testEncryptedWithKeyPdf() throws Exception {
|
||||
/*
|
||||
* This tests that a PDF that has been encrypted with a public key fails in the way expected
|
||||
*/
|
||||
ElasticsearchParseException e = expectThrows(
|
||||
ElasticsearchParseException.class,
|
||||
() -> parseDocument("encrypted-with-key.pdf", processor)
|
||||
);
|
||||
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
|
||||
}
|
||||
|
||||
public void testHtmlDocument() throws Exception {
|
||||
Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
|
||||
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue