/* * Copyright (C) 2020, KylinSoft Co., Ltd. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. * * Authors: zhangpengfei * */ #include "construct-document.h" #include "file-utils.h" #include "chinese-segmentation.h" #include #include #include //extern QList *g_docListForPath; //extern QMutex g_mutexDocListForPath; /* NOTE(review): this copy of the file has lost its line breaks and every piece of text that sat between angle brackets (include targets, template arguments, cast targets, parts of stream expressions). The tail of ConstructDocumentForPath::run() and the signature of the ConstructDocumentForContent constructor are missing as a result. Restore the file from version control before building or editing the code. */ using namespace UkuiSearch; /* Constructor: enables auto-deletion through setAutoDelete(true) and takes the caller's list by moving it into m_list. run() reads elements 0 and 1 of that list. */ ConstructDocumentForPath::ConstructDocumentForPath(QVector list) { this->setAutoDelete(true); m_list = std::move(list); } /* Starts building a Document from m_list: element 0 supplies the lower-cased index text and, with every '.' removed, the input to FileUtils::findMultiToneWords; element 1 is the source path. uniqueterm is FileUtils::makeDocUterm(sourcePath); upTerm is ZEEKERUPTERM followed by makeDocUterm of the source path without its last '/'-separated section. NOTE(review): the remainder of this function is missing from this copy of the file. */ void ConstructDocumentForPath::run() { // qDebug()<<"ConstructDocumentForPath"; // if(!UkuiSearch::g_docListForPath) // UkuiSearch::g_docListForPath = new QVector; // qDebug()<size(); QString index_text = m_list.at(0).toLower(); QString sourcePath = m_list.at(1); Document doc; //Polyphonic-character (multi-tone) version //Initial letters are now included as well QStringList pinyin_text_list = FileUtils::findMultiToneWords(QString(m_list.at(0)).replace(".", "")); // if(!pinyin_text_list.isEmpty()) // { // for (QString& i : pinyin_text_list){ // i.replace("", " "); // i = i.simplified(); // } // doc.setIndexText(pinyin_text_list); // } QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(sourcePath)); QString upTerm = QString::fromStdString("ZEEKERUPTERM" + FileUtils::makeDocUterm(sourcePath.section("/", 0, -2, QString::SectionIncludeLeadingSep))); // qDebug()<<"sourcePath"<setAutoDelete(true); m_path = std::move(path); } /* Builds the Document for the file at m_path and appends it to IndexGenerator::g_docListForContent while holding g_mutexDocListForContent. The text comes from FileReader::getTextContent. The document gets a unique term from FileUtils::makeDocUterm(m_path), a ZEEKERUPTERM-prefixed term for the path without its last '/'-separated section, and m_path in value slot 1. Empty text calls doc.reuireDeleted(); otherwise the text is stored as data, tabs and the two full-width punctuation marks are replaced with spaces, the first 20480000 characters are segmented, and each resulting term is added as a posting. */ void ConstructDocumentForContent::run() { // qDebug() << "ConstructDocumentForContent 
currentThreadId()" << QThread::currentThreadId(); //构造文本索引的document QString content; FileReader::getTextContent(m_path, content); Document doc; doc.setUniqueTerm(FileUtils::makeDocUterm(m_path)); doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep))); doc.addValue(1, m_path); if(content.isEmpty()) { doc.reuireDeleted(); } else { doc.setData(content); //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); } term.clear(); term.shrink_to_fit(); } IndexGenerator::g_mutexDocListForContent.lock(); IndexGenerator::g_docListForContent.append(doc); IndexGenerator::g_mutexDocListForContent.unlock(); content.clear(); content.squeeze(); return; } ConstructDocumentForOcr::ConstructDocumentForOcr(QString path) { this->setAutoDelete(true); m_path = std::move(path); } void ConstructDocumentForOcr::run() { QString content; FileReader::getTextContent(m_path, content); Document doc; doc.setUniqueTerm(FileUtils::makeDocUterm(m_path)); doc.addTerm("ZEEKERUPTERM" + FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep))); doc.addValue(1, m_path); if(content.isEmpty()) { doc.reuireDeleted(); } else { doc.setData(content); //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. 
/* Replace tabs, U+FF0C (full-width comma) and U+3002 (ideographic full stop) with spaces, then segment the whole text and add one posting (word, offsets, weight) per term. */ content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString()); for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); } term.clear(); term.shrink_to_fit(); } /* Append the finished document to the shared OCR document list while holding its mutex. */ IndexGenerator::g_mutexDocListForOcr.lock(); IndexGenerator::g_docListForOcr.append(doc); IndexGenerator::g_mutexDocListForOcr.unlock(); content.clear(); content.squeeze(); }