ESQL: Avoid unintended attribute removal (#127563)

---------

Co-authored-by: Andrei Stefan <astefan@users.noreply.github.com>
This commit is contained in:
kanoshiou 2025-05-21 15:19:10 +08:00 committed by GitHub
parent 288f47eef6
commit 88b61e3621
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 157 additions and 6 deletions

View File

@ -0,0 +1,6 @@
pr: 127563
summary: "ESQL: Avoid unintended attribute removal"
area: ES|QL
type: bug
issues:
- 127468

View File

@ -51,7 +51,6 @@ public abstract class GenerativeRestTest extends ESRestTestCase {
"Unknown column \\[<all-fields-projected>\\]", // https://github.com/elastic/elasticsearch/issues/121741,
"Plan \\[ProjectExec\\[\\[<no-fields>.* optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/125866
"optimized incorrectly due to missing references", // https://github.com/elastic/elasticsearch/issues/116781
"only supports KEYWORD or TEXT values", // https://github.com/elastic/elasticsearch/issues/127468
"The incoming YAML document exceeds the limit:" // still to investigate, but it seems to be specific to the test framework
);

View File

@ -331,3 +331,43 @@ ROW a="b c d x"| DISSECT a "%{b} %{} %{d} %{}";
a:keyword | b:keyword | d:keyword
b c d x | b | d
;
avoidAttributesRemoval
// https://github.com/elastic/elasticsearch/issues/127468
required_capability: keep_regex_extract_attributes
required_capability: join_lookup_v12
from message_types
| eval type = 1
| lookup join message_types_lookup on message
| drop message
| dissect type "%{b}"
| stats x = max(b)
| keep x
;
x:keyword
Success
;
avoidAttributesRemoval2
// https://github.com/elastic/elasticsearch/issues/127468
required_capability: keep_regex_extract_attributes
required_capability: join_lookup_v12
FROM sample_data, employees
| EVAL client_ip = client_ip::keyword
| RENAME languages AS language_code
| LOOKUP JOIN clientips_lookup ON client_ip
| EVAL type = 1::keyword
| EVAL type = 2
| LOOKUP JOIN message_types_lookup ON message
| LOOKUP JOIN languages_lookup ON language_code
| DISSECT type "%{type_as_text}"
| KEEP message
| WHERE message IS NOT NULL
| SORT message DESC
| LIMIT 1
;
message:keyword
Disconnected
;

View File

@ -297,3 +297,35 @@ row text = "123 abc", int = 5 | sort int asc | grok text "%{NUMBER:text:int} %{W
text:integer | int:integer | description:keyword
123 | 5 | abc
;
avoidAttributesRemoval
// https://github.com/elastic/elasticsearch/issues/127468
required_capability: union_types
required_capability: join_lookup_v12
required_capability: keep_regex_extract_attributes
from multivalue_points,h*,messa*
| eval `card` = true, PbehoQUqKSF = "VLGjhcgNkQiEVyCLo", DsxMWtGL = true, qSxTIvUorMim = true, `location` = 8593178066470220111, type = -446161601, FSkGQkgmS = false
| eval PbehoQUqKSF = 753987034, HLNMQfQj = true, `within` = true, `id` = "JDKKkYwhhh", lk = null, aecuvjTkgZza = 510616700, aDAMpuVtNX = null, qCopgNZPt = "AjhJUtZefqKdJYH", BxHHlFoA = "isBrmhKLc"
| rename message as message
| lookup join message_types_lookup on message
| sort PbehoQUqKSF DESC, ip1 DESC NULLS LAST
| limit 5845
| drop `subset`, ip*, `card`, `within`, host.v*, description, `aecuvjTkgZza`, host.version, `ip0`, height_range, DsxMWtGL, host_group, `aDAMpuVtNX`, PbehoQUqKSF, `intersects`, `host.os`, aDAMpuVtNX, *ight_range, HLNMQfQj, `FSkGQkgmS`, BxHHlFoA, card
| grok type "%{WORD:GknCxQFo}"
| eval `location` = null, ZjWUUvGusyyz = null, HeeKIpzgh = false, `id` = 4325287503714500302, host = false, `lk` = null, HvTQdOqFajpH = false, fKNlsYoT = true, `location` = -1158449473, `qCopgNZPt` = 1219986202615280617
| drop HeeKIpzg*, `ZjWUUvGusyyz`, `message`, `type`, `lk`
| grok GknCxQFo "%{WORD:location} %{WORD:HvTQdOqFajpH}"
| drop HvTQdOqFajpH, `location`, centroid
| mv_expand GknCxQFo
| limit 410
| limit 3815
| rename `id` AS `GknCxQFo`
| grok host.name "%{WORD:oGQQZHxQHj} %{WORD:qCopgNZPt} %{WORD:vHKOmmocPcTO}"
| stats BkQXJRMeAM = min(GknCxQFo)
| keep `BkQXJRMeAM`
;
BkQXJRMeAM:long
4325287503714500302
;

View File

@ -1091,7 +1091,13 @@ public class EsqlCapabilities {
* During resolution (pre-analysis) we have to consider that joins can override regex extracted values
* see <a href="https://github.com/elastic/elasticsearch/issues/127467"> ES|QL: pruning of JOINs leads to missing fields #127467 </a>
*/
FIX_JOIN_MASKING_REGEX_EXTRACT;
FIX_JOIN_MASKING_REGEX_EXTRACT,
/**
* Avid GROK and DISSECT attributes being removed when resolving fields.
* see <a href="https://github.com/elastic/elasticsearch/issues/127468"> ES|QL: Grok only supports KEYWORD or TEXT values, found expression [type] type [INTEGER] #127468 </a>
*/
KEEP_REGEX_EXTRACT_ATTRIBUTES;
private final boolean enabled;

View File

@ -656,7 +656,7 @@ public class EsqlSession {
//
// and ips_policy enriches the results with the same name ip field),
// these aliases should be kept in the list of fields.
if (canRemoveAliases[0] && couldOverrideAliases(p)) {
if (canRemoveAliases[0] && p.anyMatch(EsqlSession::couldOverrideAliases)) {
canRemoveAliases[0] = false;
}
if (canRemoveAliases[0]) {
@ -726,7 +726,8 @@ public class EsqlSession {
|| p instanceof Project
|| p instanceof RegexExtract
|| p instanceof Rename
|| p instanceof TopN) == false;
|| p instanceof TopN
|| p instanceof UnresolvedRelation) == false;
}
private static boolean matchByName(Attribute attr, String other, boolean skipIfPattern) {

View File

@ -604,7 +604,20 @@ public class IndexResolverFieldNamesTests extends ESTestCase {
| eval language = concat(x, "-", lang)
| keep emp_no, x, lang, language
| sort emp_no desc | limit 3""",
Set.of("languages", "languages.*", "emp_no", "emp_no.*", "language_name", "language_name.*", "x", "x.*", "lang", "lang.*")
Set.of(
"emp_no",
"x",
"lang",
"language",
"language_name",
"languages",
"x.*",
"language_name.*",
"languages.*",
"emp_no.*",
"lang.*",
"language.*"
)
);
}
@ -1355,7 +1368,7 @@ public class IndexResolverFieldNamesTests extends ESTestCase {
| grok type "%{WORD:b}"
| stats x = max(b)
| keep x""", Set.of());
assertThat(fieldNames, equalTo(Set.of("message", "x", "x.*", "message.*")));
assertThat(fieldNames, equalTo(Set.of("x", "b", "type", "message", "x.*", "message.*", "type.*", "b.*")));
}
public void testAvoidGrokAttributesRemoval2() {
@ -1388,6 +1401,60 @@ public class IndexResolverFieldNamesTests extends ESTestCase {
}
/**
* @see <a href="https://github.com/elastic/elasticsearch/issues/127468">ES|QL: Grok only supports KEYWORD or TEXT values, found expression [type] type [INTEGER]</a>
*/
public void testAvoidGrokAttributesRemoval4() {
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
Set<String> fieldNames = fieldNames("""
from message_types
| eval type = 1
| lookup join message_types_lookup on message
| drop message
| grok type "%{WORD:b}"
| stats x = max(b)
| keep x""", Set.of());
assertThat(fieldNames, equalTo(Set.of("x", "b", "type", "message", "x.*", "message.*", "type.*", "b.*")));
}
/**
* @see <a href="https://github.com/elastic/elasticsearch/issues/127468">ES|QL: Grok only supports KEYWORD or TEXT values, found expression [type] type [INTEGER]</a>
*/
public void testAvoidGrokAttributesRemoval5() {
assumeTrue("LOOKUP JOIN available as snapshot only", EsqlCapabilities.Cap.JOIN_LOOKUP_V12.isEnabled());
Set<String> fieldNames = fieldNames("""
FROM sample_data, employees
| EVAL client_ip = client_ip::keyword
| RENAME languages AS language_code
| LOOKUP JOIN clientips_lookup ON client_ip
| EVAL type = 1::keyword
| EVAL type = 2
| LOOKUP JOIN message_types_lookup ON message
| LOOKUP JOIN languages_lookup ON language_code
| DISSECT type "%{type_as_text}"
| KEEP message
| WHERE message IS NOT NULL
| SORT message DESC
| LIMIT 1""", Set.of());
assertThat(
fieldNames,
equalTo(
Set.of(
"message",
"type",
"languages",
"client_ip",
"language_code",
"language_code.*",
"client_ip.*",
"message.*",
"type.*",
"languages.*"
)
)
);
}
public void testEnrichOnDefaultField() {
Set<String> fieldNames = fieldNames("""
from employees