Improving Tika PDF handling (#101486)

Correctly handle PDF files that use public-key encryption in the attachment processor.
This commit is contained in:
Keith Massey 2023-10-30 07:41:18 -05:00 committed by GitHub
parent 6d9d5469eb
commit a59180459a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 29 additions and 1 deletions

View File

@ -0,0 +1,5 @@
pr: 101486
summary: Improving tika handling
area: Ingest Node
type: bug
issues: []

View File

@ -107,6 +107,15 @@ final class TikaImpl {
} else {
throw new AssertionError(cause);
}
} catch (LinkageError e) {
if (e.getMessage().contains("bouncycastle")) {
/*
* Elasticsearch does not ship with bouncycastle. It is only used for public-key-encrypted PDFs, which this module does
* not support anyway.
*/
throw new RuntimeException("document is encrypted", e);
}
throw new RuntimeException(e);
}
}

View File

@ -243,11 +243,25 @@ public class AttachmentProcessorTests extends ESTestCase {
assertThat(attachmentData.get("content_length"), is(0L));
}
public void testEncryptedPdf() throws Exception {
public void testEncryptedWithPasswordPdf() throws Exception {
    /*
     * A PDF protected with a password cannot be parsed. Parsing it must fail with an
     * ElasticsearchParseException whose detailed message says the document is encrypted.
     */
    final String fixture = "encrypted.pdf";
    ElasticsearchParseException failure = expectThrows(
        ElasticsearchParseException.class,
        () -> parseDocument(fixture, processor)
    );
    final String detailedMessage = failure.getDetailedMessage();
    assertThat(detailedMessage, containsString("document is encrypted"));
}
public void testEncryptedWithKeyPdf() throws Exception {
    /*
     * A PDF encrypted with a public key cannot be parsed. Parsing it must fail with an
     * ElasticsearchParseException whose detailed message says the document is encrypted.
     */
    final String fixture = "encrypted-with-key.pdf";
    ElasticsearchParseException failure = expectThrows(ElasticsearchParseException.class, () -> parseDocument(fixture, processor));
    final String detailedMessage = failure.getDetailedMessage();
    assertThat(detailedMessage, containsString("document is encrypted"));
}
public void testHtmlDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);