From 79527082f99dab06446c5c2c77d19a24facd0531 Mon Sep 17 00:00:00 2001 From: jixiaoxu Date: Mon, 20 Mar 2023 15:34:05 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=B8=AD=E6=96=87=E5=88=86?= =?UTF-8?q?=E8=AF=8D=E6=8E=A5=E5=8F=A3=E7=BC=96=E8=AF=91=E9=97=AE=E9=A2=98?= =?UTF-8?q?=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libsearch/index/file-content-indexer.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libsearch/index/file-content-indexer.cpp b/libsearch/index/file-content-indexer.cpp index ee488a4..dc989b5 100644 --- a/libsearch/index/file-content-indexer.cpp +++ b/libsearch/index/file-content-indexer.cpp @@ -41,9 +41,7 @@ bool fileContentIndexer::index() } m_document.setData(content); - //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. - content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); - std::vector term = ChineseSegmentation::getInstance()->callSegment(content.left(20480000).toStdString()); + std::vector term = ChineseSegmentation::getInstance()->callSegment(content); content.clear(); content.squeeze();