From c85a4a5d277100c8d3c29aa2077823ce18359098 Mon Sep 17 00:00:00 2001 From: iaom <18504285112@163.com> Date: Mon, 31 May 2021 14:56:59 +0800 Subject: [PATCH] [FIX]: Offset info error in keyword extraction. --- libsearch/index/construct-document.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index f1751a9..e54ff84 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -116,7 +116,8 @@ void ConstructDocumentForContent::run() { doc.addTerm(upTerm); doc.addValue(m_path); - content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); + //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. + content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); // QVector term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000)); //修改函数返回类型,修改入参为std::string引用--jxx20210519