Extract more standard metadata from binary files (#78754)
Until now, we extracted only a few fields from the binary files sent to the ingest attachment plugin:

* `content`,
* `title`,
* `author`,
* `keywords`,
* `date`,
* `content_type`,
* `content_length`,
* `language`.

Tika has a list of additional standard properties which can be extracted:

* `modified`,
* `format`,
* `identifier`,
* `contributor`,
* `coverage`,
* `modifier`,
* `creator_tool`,
* `publisher`,
* `relation`,
* `rights`,
* `source`,
* `type`,
* `description`,
* `print_date`,
* `metadata_date`,
* `latitude`,
* `longitude`,
* `altitude`,
* `rating`,
* `comments`

This commit exposes those new fields. Related to #22339.

Co-authored-by: Keith Massey <keith.massey@elastic.co>
parent 480fd6aa08
commit 564ff9db88
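At its core, the change copies each requested property from Tika's extracted metadata into the `attachment` map only when Tika actually returned a non-empty value, which is what the `addAdditionalField` helper in the processor below does. Here is a minimal, self-contained sketch of that pattern; the `MetadataSketch` class, its `Property` enum and the sample values are hypothetical stand-ins, while the `Metadata` and `TikaCoreProperties` calls are the real Tika API:

import java.util.EnumSet;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

public class MetadataSketch {

    // Hypothetical stand-in for the processor's Property enum (subset only).
    enum Property { MODIFIED, PUBLISHER, RIGHTS }

    public static void main(String[] args) {
        // Pretend Tika has just parsed a document and filled in two properties.
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.MODIFIED, "2015-02-20T11:36:00Z");
        metadata.set(TikaCoreProperties.PUBLISHER, "JDI");
        // TikaCoreProperties.RIGHTS is deliberately left unset.

        Set<Property> requested = EnumSet.of(Property.MODIFIED, Property.RIGHTS);
        Map<String, Object> additionalFields = new HashMap<>();

        addAdditionalField(additionalFields, requested, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED));
        addAdditionalField(additionalFields, requested, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER));
        addAdditionalField(additionalFields, requested, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS));

        // Only "modified" is kept: PUBLISHER was not requested, RIGHTS has no value.
        System.out.println(additionalFields);
    }

    // Add a field only when it was requested and Tika extracted a non-empty value.
    static void addAdditionalField(Map<String, Object> fields, Set<Property> requested, Property property, String value) {
        if (requested.contains(property) && value != null && value.isEmpty() == false) {
            fields.put(property.name().toLowerCase(Locale.ROOT), value);
        }
    }
}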
@@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file:
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.

[[ingest-attachment-fields]]
==== Exported fields

The fields which might be extracted from a document are:

* `content`,
* `title`,
* `author`,
* `keywords`,
* `date`,
* `content_type`,
* `content_length`,
* `language`,
* `modified`,
* `format`,
* `identifier`,
* `contributor`,
* `coverage`,
* `modifier`,
* `creator_tool`,
* `publisher`,
* `relation`,
* `rights`,
* `source`,
* `type`,
* `description`,
* `print_date`,
* `metadata_date`,
* `latitude`,
* `longitude`,
* `altitude`,
* `rating`,
* `comments`

To extract only certain `attachment` fields, specify the `properties` array:

[source,console]
@@ -86,6 +86,12 @@ tasks.named("forbiddenPatterns").configure {
  exclude '**/text-cjk-*.txt'
}

tasks.named("yamlRestTestV7CompatTransform").configure { task ->
  // 2 new tika metadata fields are returned in v8
  task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file")
  task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file")
}

tasks.named("thirdPartyAudit").configure {
  ignoreMissingClasses()
}
@@ -11,6 +11,7 @@ package org.elasticsearch.ingest.attachment;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.Strings;
@@ -132,40 +133,11 @@ public final class AttachmentProcessor extends AbstractProcessor {
            additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
        }

        if (properties.contains(Property.DATE)) {
            String createdDate = metadata.get(TikaCoreProperties.CREATED);
            if (createdDate != null) {
                additionalFields.put(Property.DATE.toLowerCase(), createdDate);
            }
        }

        if (properties.contains(Property.TITLE)) {
            String title = metadata.get(TikaCoreProperties.TITLE);
            if (Strings.hasLength(title)) {
                additionalFields.put(Property.TITLE.toLowerCase(), title);
            }
        }

        if (properties.contains(Property.AUTHOR)) {
            String author = metadata.get("Author");
            if (Strings.hasLength(author)) {
                additionalFields.put(Property.AUTHOR.toLowerCase(), author);
            }
        }

        if (properties.contains(Property.KEYWORDS)) {
            String keywords = metadata.get("Keywords");
            if (Strings.hasLength(keywords)) {
                additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
            }
        }

        if (properties.contains(Property.CONTENT_TYPE)) {
            String contentType = metadata.get(Metadata.CONTENT_TYPE);
            if (Strings.hasLength(contentType)) {
                additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
            }
        }
        addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED));
        addAdditionalField(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE));
        addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author"));
        addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords"));
        addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));

        if (properties.contains(Property.CONTENT_LENGTH)) {
            String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
@@ -178,6 +150,30 @@ public final class AttachmentProcessor extends AbstractProcessor {
            additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }

        addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR));
        addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS));

        addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED));
        addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT));
        addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER));
        addAdditionalField(additionalFields, Property.CONTRIBUTOR, metadata.get(TikaCoreProperties.CONTRIBUTOR));
        addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE));
        addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER));
        addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL));
        addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER));
        addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION));
        addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS));
        addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE));
        addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE));
        addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION));
        addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE));
        addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE));
        addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE));
        addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE));
        addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE));
        addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING));
        addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS));

        ingestDocument.setFieldValue(targetField, additionalFields);

        if (removeBinary) {
@@ -186,6 +182,18 @@ public final class AttachmentProcessor extends AbstractProcessor {
        return ingestDocument;
    }

    /**
     * Add an additional field if not null or empty
     * @param additionalFields additional fields
     * @param property property to add
     * @param value value to add
     */
    private <T> void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) {
        if (properties.contains(property) && Strings.hasLength(value)) {
            additionalFields.put(property.toLowerCase(), value);
        }
    }

    @Override
    public String getType() {
        return TYPE;
@@ -270,7 +278,27 @@ public final class AttachmentProcessor extends AbstractProcessor {
        DATE,
        CONTENT_TYPE,
        CONTENT_LENGTH,
        LANGUAGE;
        LANGUAGE,
        MODIFIED,
        FORMAT,
        IDENTIFIER,
        CONTRIBUTOR,
        COVERAGE,
        MODIFIER,
        CREATOR_TOOL,
        PUBLISHER,
        RELATION,
        RIGHTS,
        SOURCE,
        TYPE,
        DESCRIPTION,
        PRINT_DATE,
        METADATA_DATE,
        LATITUDE,
        LONGITUDE,
        ALTITUDE,
        RATING,
        COMMENTS;

        public static Property parse(String value) {
            return valueOf(value.toUpperCase(Locale.ROOT));
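Names supplied to the processor (for example through its `properties` option) are mapped onto these constants case-insensitively by `parse`; anything that does not match one of the constants above propagates the `IllegalArgumentException` thrown by `valueOf`. A small usage sketch, with illustrative input strings:

// Usage sketch: configuration strings are matched case-insensitively against the enum.
AttachmentProcessor.Property modified = AttachmentProcessor.Property.parse("modified");        // MODIFIED
AttachmentProcessor.Property creatorTool = AttachmentProcessor.Property.parse("Creator_Tool"); // CREATOR_TOOL
// AttachmentProcessor.Property.parse("pages") would throw IllegalArgumentException: no such constant.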
@@ -68,9 +68,20 @@ public class AttachmentProcessorTests extends ESTestCase {
    }

    public void testHtmlDocumentWithRandomFields() throws Exception {
        // date is not present in the html doc
        // some metadata are not present in the html doc
        // "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude"
        // "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage"
        // "rating", "type", "contributor", "rights"
        // we are only trying with content, title, author, keywords, content_type and content_length.
        ArrayList<AttachmentProcessor.Property> fieldsList = new ArrayList<>(
            EnumSet.complementOf(EnumSet.of(AttachmentProcessor.Property.DATE))
            EnumSet.of(
                AttachmentProcessor.Property.CONTENT,
                AttachmentProcessor.Property.TITLE,
                AttachmentProcessor.Property.AUTHOR,
                AttachmentProcessor.Property.KEYWORDS,
                AttachmentProcessor.Property.CONTENT_TYPE,
                AttachmentProcessor.Property.CONTENT_LENGTH
            )
        );
        Set<AttachmentProcessor.Property> selectedProperties = new HashSet<>();
@@ -128,7 +139,20 @@ public class AttachmentProcessorTests extends ESTestCase {
    public void testWordDocument() throws Exception {
        Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor);

        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
        assertThat(
            attachmentData.keySet(),
            containsInAnyOrder(
                "content",
                "language",
                "date",
                "author",
                "content_type",
                "content_length",
                "modifier",
                "modified",
                "publisher"
            )
        );
        assertThat(attachmentData.get("content"), is(notNullValue()));
        assertThat(attachmentData.get("language"), is("en"));
        assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z"));
@@ -138,12 +162,28 @@ public class AttachmentProcessorTests extends ESTestCase {
            attachmentData.get("content_type").toString(),
            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        );
        assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret"));
        assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z"));
        assertThat(attachmentData.get("publisher").toString(), is("JDI"));
    }

    public void testWordDocumentWithVisioSchema() throws Exception {
        Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);

        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
        assertThat(
            attachmentData.keySet(),
            containsInAnyOrder(
                "content",
                "language",
                "date",
                "author",
                "content_type",
                "content_length",
                "modifier",
                "modified",
                "print_date"
            )
        );
        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
        assertThat(attachmentData.get("language"), is("en"));
        assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
@@ -153,18 +193,37 @@ public class AttachmentProcessorTests extends ESTestCase {
            attachmentData.get("content_type").toString(),
            is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
        );
        assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour"));
        assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z"));
        assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
    }

    public void testLegacyWordDocumentWithVisioSchema() throws Exception {
        Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);

        assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
        assertThat(
            attachmentData.keySet(),
            containsInAnyOrder(
                "content",
                "language",
                "date",
                "author",
                "content_type",
                "content_length",
                "modifier",
                "modified",
                "print_date"
            )
        );
        assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
        assertThat(attachmentData.get("language"), is("en"));
        assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
        assertThat(attachmentData.get("author"), is(notNullValue()));
        assertThat(attachmentData.get("content_length"), is(notNullValue()));
        assertThat(attachmentData.get("content_type").toString(), is("application/msword"));
        assertThat(attachmentData.get("modifier").toString(), is("David Pilato"));
        assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z"));
        assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
    }

    public void testPdf() throws Exception {
@@ -217,9 +276,26 @@ public class AttachmentProcessorTests extends ESTestCase {

        assertThat(
            attachmentData.keySet(),
            containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords")
            containsInAnyOrder(
                "language",
                "content",
                "author",
                "title",
                "content_type",
                "content_length",
                "date",
                "keywords",
                "identifier",
                "contributor",
                "publisher",
                "description"
            )
        );
        assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
        assertThat(attachmentData.get("identifier").toString(), is("1234567890"));
        assertThat(attachmentData.get("contributor").toString(), is("no-one"));
        assertThat(attachmentData.get("publisher").toString(), is("Apache"));
        assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika."));
    }

    // no real detection, just rudimentary
@@ -1,5 +1,8 @@
---
"Test ingest attachment processor with .doc file":
  - skip:
      version: " - 7.99.99"
      reason: "new fields added in 8.0.0"
  - do:
      ingest.put_pipeline:
        id: "my_pipeline"
@@ -27,17 +30,22 @@
      get:
        index: test
        id: 1
  - length: { _source.attachment: 6 }
  - length: { _source.attachment: 8 }
  - match: { _source.attachment.content: "Test elasticsearch" }
  - match: { _source.attachment.language: "et" }
  - match: { _source.attachment.author: "David Pilato" }
  - match: { _source.attachment.date: "2016-03-10T08:25:00Z" }
  - match: { _source.attachment.content_length: 19 }
  - match: { _source.attachment.content_type: "application/msword" }
  - match: { _source.attachment.modifier: "David Pilato" }
  - match: { _source.attachment.modified: "2016-03-10T08:25:00Z" }


---
"Test ingest attachment processor with .docx file":
  - skip:
      version: " - 7.99.99"
      reason: "new fields added in 8.0.0"
  - do:
      ingest.put_pipeline:
        id: "my_pipeline"
@@ -65,10 +73,12 @@
      get:
        index: test
        id: 1
  - length: { _source.attachment: 6 }
  - length: { _source.attachment: 8 }
  - match: { _source.attachment.content: "Test elasticsearch" }
  - match: { _source.attachment.language: "et" }
  - match: { _source.attachment.author: "David Pilato" }
  - match: { _source.attachment.date: "2016-03-10T08:24:00Z" }
  - match: { _source.attachment.content_length: 19 }
  - match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }
  - match: { _source.attachment.modifier: "David Pilato" }
  - match: { _source.attachment.modified: "2016-03-10T08:24:00Z" }