// Source path: ukui-search/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.cpp
// Repository viewer metadata: 136 lines, 4.7 KiB, C++

/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "pinyin4cpp_dataTrie.h"
/// Default constructor: performs no work of its own in this translation unit.
Pinyin4cppDataTrie::Pinyin4cppDataTrie() = default;
/**
 * Releases the memory mapping and the file descriptor held by this object.
 *
 * munmap()/close() are only issued when the corresponding member looks like
 * a live handle, so destroying an object that never attached a cache file
 * (or whose attach failed) does not hand a null/MAP_FAILED address or a
 * negative descriptor to the kernel.
 *
 * NOTE(review): this assumes the class definition initialises m_mmapAddr to
 * nullptr and m_mmapFd to a negative value — confirm in the header.
 */
Pinyin4cppDataTrie::~Pinyin4cppDataTrie()
{
    if (m_mmapAddr != nullptr && MAP_FAILED != m_mmapAddr) {
        munmap(m_mmapAddr, m_mmapLength);
    }
    m_mmapAddr = nullptr;

    if (m_mmapFd >= 0) {
        close(m_mmapFd);
    }
    m_mmapFd = -1;
}
/**
 * Looks up @p key with an exact match in the double-array trie.
 *
 * @param key  Key to search for.
 * @return The NUL-terminated string stored in the elements area at the offset
 *         the trie associates with @p key, or an empty string when the key is
 *         not matched or the offset falls outside [0, m_elementsSize).
 */
string Pinyin4cppDataTrie::Find(const string &key) const {
    // darts-clone interface.
    Darts::DoubleArray::result_pair_type match;
    m_DoubleArrayDataTrie.exactMatchSearch(key.c_str(), match);

    // A zero length means "no match"; the value is used as a byte offset into
    // the elements area, so it must also be inside that area. (todo marker
    // kept from the original range check.)
    const bool matched = (match.length != 0);
    const bool offset_in_range =
        matched && (match.value >= 0) && (static_cast<size_t>(match.value) < m_elementsSize);
    if (!offset_in_range) {
        return string();
    }
    return string(m_elementsPtr + match.value);

    // cedarpp interface, kept for reference:
    // int result = m_DoubleArrayDataTrie.exactMatchSearch<int>(key.c_str(), key.size());
    // if (result < 0)
    //     return string();
    // return string(&m_elementsPtr[result]);
}
/**
 * Builds the cache file from @p elements and then attaches to it.
 *
 * @param elements        Key/value pairs passed on to BuildDatCache().
 * @param dat_cache_file  Path of the cache file to write and then attach.
 * @param md5             Digest string passed to both steps.
 * @return The result of InitAttachDat() on the freshly written cache file.
 */
bool Pinyin4cppDataTrie::InitBuildDat(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    // Step 1: (re)generate the on-disk cache.
    BuildDatCache(elements, dat_cache_file, md5);

    // Step 2: attach to what was just written.
    const bool attached = InitAttachDat(dat_cache_file, md5);
    return attached;
}
bool Pinyin4cppDataTrie::InitAttachDat(const string &dat_cache_file, const string &md5) {
m_mmapFd = open(dat_cache_file.c_str(), O_RDONLY);
if (m_mmapFd < 0) {
return false;
}
const auto seek_off = lseek(m_mmapFd, 0, SEEK_END);
assert(seek_off >= 0);
m_mmapLength = static_cast<size_t>(seek_off);
m_mmapAddr = reinterpret_cast<char *>(mmap(NULL, m_mmapLength, PROT_READ, MAP_SHARED, m_mmapFd, 0));
assert(MAP_FAILED != m_mmapAddr);
assert(m_mmapLength >= sizeof(CacheFileHeader));
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(m_mmapAddr);
m_elementsNum = header.elements_num;
m_elementsSize = header.elements_size;
assert(sizeof(header.md5_hex) == md5.size());
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
return false;
}
assert(m_mmapLength == sizeof(CacheFileHeader) + header.elements_size + header.dat_size * m_DoubleArrayDataTrie.unit_size());
m_elementsPtr = (const char *)(m_mmapAddr + sizeof(CacheFileHeader));
const char * dat_ptr = m_mmapAddr + sizeof(CacheFileHeader) + header.elements_size;
m_DoubleArrayDataTrie.set_array((char *)dat_ptr, header.dat_size);
return true;
}
/**
 * Builds the double-array trie from @p elements and writes the cache file.
 *
 * The file consists of a CacheFileHeader, then every value string (each
 * followed by its terminating '\0') in map order, then the raw trie array.
 * Each key is stored in the trie with the byte offset of its value inside
 * that string area. The data is written to a temporary file created next to
 * @p dat_cache_file and renamed over it once complete.
 *
 * The function returns void, so failures cannot be reported directly: on any
 * failure (digest length mismatch, trie build error, temp-file creation,
 * write, close or rename error) the temporary file is removed and
 * @p dat_cache_file is left as it was.
 *
 * @param elements        Key -> value strings; values are expected to be
 *                        non-empty (asserted).
 * @param dat_cache_file  Destination path of the cache file.
 * @param md5             Digest stored in the header; its length must equal
 *                        sizeof(CacheFileHeader::md5_hex).
 */
void Pinyin4cppDataTrie::BuildDatCache(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    CacheFileHeader header{};
    // Guard the fixed-size copy below even in builds where assert is compiled out.
    if (sizeof(header.md5_hex) != md5.size()) {
        return;
    }
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());

    // The value stored for each key is the byte offset of its string within
    // the elements area of the file.
    int offset(0);
    for (const auto &info : elements) {
        assert(info.second.size() > 0);
        keys_ptr_vec.push_back(info.first.c_str());
        values_vec.push_back(offset);
        offset += static_cast<int>(info.second.size() + 1); // +1 for the trailing '\0'
    }

    // data() instead of &v[0]: indexing an empty vector is undefined behaviour.
    const auto ret = m_DoubleArrayDataTrie.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    if (0 != ret) {
        return;
    }

    header.elements_num = elements.size();
    header.elements_size = offset;
    header.dat_size = m_DoubleArrayDataTrie.size();

    // mkstemp rewrites the trailing XXXXXX in place, so it needs a writable buffer.
    string tmp_filepath = dat_cache_file + "_XXXXXX";
    // Restore the caller's umask afterwards instead of changing it for the
    // rest of the process.
    const auto prev_mask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(prev_mask);
    if (fd < 0) {
        return;
    }
    // Best effort, as in the original: a failure only leaves stricter permissions.
    (void)fchmod(fd, 0644);

    // write() may transfer fewer bytes than requested; loop until everything
    // is written or an error occurs.
    const auto write_all = [fd](const void *buf, size_t len) -> bool {
        const char *cursor = static_cast<const char *>(buf);
        while (len > 0) {
            const auto written = write(fd, cursor, len);
            if (written <= 0) {
                return false;
            }
            cursor += written;
            len -= static_cast<size_t>(written);
        }
        return true;
    };

    bool ok = write_all(&header, sizeof(header));
    for (auto it = elements.begin(); ok && it != elements.end(); ++it) {
        ok = write_all(it->second.c_str(), it->second.size() + 1);
    }
    ok = ok && write_all(m_DoubleArrayDataTrie.array(), m_DoubleArrayDataTrie.total_size());

    // Always close the descriptor; a close error also counts as a failed write.
    ok = (0 == close(fd)) && ok;
    ok = ok && (0 == rename(tmp_filepath.c_str(), dat_cache_file.c_str()));
    if (!ok) {
        // Do not leave a partial temporary file behind.
        unlink(tmp_filepath.c_str());
    }
}