Optimize the function that builds snippets from Chinese text so that it conforms to the GB18030-2022 standard.

This commit is contained in:
JunjieBai 2023-07-27 13:59:21 +08:00 committed by iaom
parent f6c928c8e1
commit da7ae48cad
3 changed files with 212 additions and 216 deletions

View File

@ -41,6 +41,7 @@
#include <QClipboard>
#include <QQueue>
#include <QFontMetrics>
#include <QTextBoundaryFinder>
#include <quazip5/quazipfile.h>
#include <stdio.h>
#include <unistd.h>
@ -782,47 +783,192 @@ QString FileUtils::escapeHtml(const QString &str)
return temp;
}
// NOTE(review): this span is a rendered diff hunk whose +/- markers were lost, so lines
// removed from chineseSubString() and lines added for getSnippet() appear interleaved
// (two signatures, two return statements, unbalanced braces in the first part).
// The comments below describe individual statements only; confirm against the
// repository before relying on them.
//
// As far as the visible statements show, getSnippet() takes up to 240 elements of
// myStr beginning at `start`, walks the text grapheme by grapheme with
// QTextBoundaryFinder, measures keyword graphemes with a bold font and all other
// graphemes with the regular font, and assembles `snippet` with at most one '\n'
// (i.e. at most two lines) limited by LABEL_MAX_WIDTH per line.
QString FileUtils::chineseSubString(const std::string &myStr, uint start, uint length, const QString &keyword)
QString FileUtils::getSnippet(const std::string &myStr, uint start, const QString &keyword)
{
std::string afterSub = "";
QString sub = QString::fromStdString(myStr);
// Bold font (application font family, point size + 2) used to measure graphemes found in the keyword.
QFont boldFont(qApp->font().family());
boldFont.setPointSizeF(qApp->font().pointSizeF() + 2);
boldFont.setWeight(QFont::Bold);
QFontMetricsF boldMetricsF(boldFont);
if (length >= myStr.length()) {
afterSub = myStr.substr(start,length); // take the substring
if (horizontalAdvanceContainsKeyword(QString::fromStdString(afterSub), keyword) >= 2*LABEL_MAX_WIDTH) {
sub = QString::fromStdString(afterSub);
}
return wrapData(sub, keyword);
}
// Maximum number of std::string elements taken from myStr for the snippet.
uint strLength = 240;
// Set to true further down when the snippet is to be built backwards from the end of the text.
bool elideLeft(false);
std::string sub = myStr.substr(start, strLength);
QString content = QString::fromStdString(sub);
// Cutting `length` characters from the keyword: the text is long enough for that
if (start + length <= myStr.length()) {
afterSub = myStr.substr(start,length); // take the substring
sub = QString::fromStdString(afterSub); // convert to QString
// Not enough text after `start`: extend the window towards the front
if (start + strLength > myStr.length()) {
// New start position
int newStart = myStr.length() - strLength;
if(start + length < myStr.length()){
sub.replace(sub.length() - 1, 1, ""); // the last character may be garbled, so drop it
}
sub = wrapData(sub, keyword);
} else {
uint newStart = myStr.length() - length; // from start to the end there are fewer than `length` characters, so move the cut position to `length` characters before the end
afterSub = myStr.substr(newStart, length);
sub = QString::fromStdString(afterSub);
if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(newStart, start)), keyword) >= 2*LABEL_MAX_WIDTH) {
sub = wrapData(sub.replace(0, 1, ""), keyword, true);
// The whole text is shorter than the window: use all of it.
if (myStr.length() < strLength) {
newStart = 0;
sub = myStr;
} else {
if (newStart + 3 < start) {
sub.replace(0, 1, "")/*.append("…")*/; // the first character may be garbled, so replace it directly
sub = myStr.substr(newStart, strLength);
}
// NOTE(review): boldMetricsF.horizontalAdvance(keyword) is added to the QString argument,
// not to the width returned by horizontalAdvanceContainsKeyword(); also the second
// argument of substr(newStart, start) is a count, not an end position. Confirm both are intended.
if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(newStart, start)) + boldMetricsF.horizontalAdvance(keyword), keyword) > 2 * LABEL_MAX_WIDTH) {
// If everything from `start` to the end measures no more than two label widths, build the snippet from the end.
if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(start)), keyword) <= 2 * LABEL_MAX_WIDTH) {
elideLeft = true;
} else {
afterSub = myStr.substr(start, length); // needing to prepend three positions or fewer means the prepended part is all garbage, so simply cut from start
sub = "" + QString::fromStdString(afterSub);
// sub.append("…");
sub = myStr.substr(start);
}
sub = wrapData(sub, keyword);
}
content = QString::fromStdString(sub);
}
// Regular application font used to measure graphemes that are not part of the keyword.
QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font);
// blockLength: width of the run (bold or normal) currently being extended.
qreal blockLength = 0;
// total: summed width of the runs already completed on the current line.
qreal total = 0;
// lineCount: number of line breaks emitted so far.
int lineCount = 0;
// normalLength / boldLength: length, in QString units, of the current normal / bold run.
int normalLength = 0;
int boldLength = 0;
QString snippet;
int boundaryStart = 0;
int boundaryEnd = 0;
// Iterate over grapheme boundaries instead of single QChars.
QTextBoundaryFinder fm(QTextBoundaryFinder::Grapheme, content);
if (!elideLeft) {
// Forward pass: append graphemes from the start of content.
for (;fm.position() != -1;fm.toNextBoundary()) {
boundaryEnd = fm.position();
QString word = content.mid(boundaryStart, boundaryEnd - boundaryStart);
// Skip an empty span (the first iteration starts with both boundaries at 0).
if (boundaryStart == boundaryEnd) {
continue;
}
// A grapheme counts as "bold" when the upper-cased keyword contains it.
if (keyword.toUpper().contains(word.toUpper())) {
if (normalLength) {
// A normal run just ended: add its width to the line total.
total += fontMetricsF.horizontalAdvance(content.mid(boundaryStart - normalLength, normalLength));
normalLength = 0;
blockLength = 0;
}
boldLength += (boundaryEnd - boundaryStart);
blockLength = boldMetricsF.horizontalAdvance(content.mid(boundaryEnd - boldLength, boldLength));
} else {
if (boldLength) {
// A bold run just ended: add its width to the line total.
total += boldMetricsF.horizontalAdvance(content.mid(boundaryStart - boldLength, boldLength));
boldLength = 0;
blockLength = 0;
}
normalLength += (boundaryEnd - boundaryStart);
blockLength = fontMetricsF.horizontalAdvance(content.mid(boundaryEnd - normalLength, normalLength));
}
// First line reached the width limit: emit the line break.
if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 0) {
if (total + blockLength > LABEL_MAX_WIDTH) {
// The grapheme does not fit: step back so the loop increment visits it again after the break.
fm.toPreviousBoundary();
snippet.append("\n");
} else {
// The grapheme fits exactly: keep it on the first line.
snippet.append(word).append("\n");
boundaryStart = boundaryEnd;
}
normalLength = 0;
boldLength = 0;
lineCount++;
total = 0;
blockLength = 0;
continue;
} else if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 1) {
// Second line reached the width limit: trim the tail and stop.
qreal distance = 0;
qreal wordSize = 0;
if (total + blockLength > LABEL_MAX_WIDTH) {
boundaryEnd = boundaryStart;
fm.toPreviousBoundary();
} else {
snippet.append(word);
}
// NOTE(review): the string literal here and in append() below appears empty in this view;
// the trimming logic suggests an elision mark was intended. Confirm against the repository.
// Walk backwards, accumulating the width and length of trailing graphemes to remove.
while (wordSize < fontMetricsF.horizontalAdvance("")) {
boundaryStart = fm.position();
wordSize += keyword.toUpper().contains(content.mid(boundaryStart, boundaryEnd - boundaryStart).toUpper()) ?
boldMetricsF.horizontalAdvance(content.mid(boundaryStart, boundaryEnd - boundaryStart))
: fontMetricsF.horizontalAdvance(content.mid(boundaryStart, boundaryEnd - boundaryStart));
distance += (boundaryEnd - boundaryStart);
boundaryEnd = boundaryStart;
fm.toPreviousBoundary();
}
snippet = snippet.left(snippet.size() - distance);
snippet.append("");
break;
}
snippet.append(word);
boundaryStart = boundaryEnd;
}
} else {
// Backward pass: prepend graphemes starting from the end of content.
boundaryEnd = content.size();
for (fm.toEnd(); fm.position() != -1; fm.toPreviousBoundary()) {
boundaryStart = fm.position();
// Skip an empty span (the first iteration starts with both boundaries at the end).
if (boundaryEnd == boundaryStart) {
continue;
}
QString word = content.mid(boundaryStart, boundaryEnd - boundaryStart);
if (keyword.toUpper().contains(word.toUpper())) {
if (normalLength) {
// A normal run (located after boundaryEnd) just ended: add its width to the line total.
total += fontMetricsF.horizontalAdvance(content.mid(boundaryEnd, normalLength));
normalLength = 0;
blockLength = 0;
}
boldLength += (boundaryEnd - boundaryStart);
blockLength = boldMetricsF.horizontalAdvance(content.mid(boundaryStart, boldLength));
} else {
if (boldLength) {
// A bold run (located after boundaryEnd) just ended: add its width to the line total.
total += boldMetricsF.horizontalAdvance(content.mid(boundaryEnd, boldLength));
boldLength = 0;
blockLength = 0;
}
normalLength += (boundaryEnd - boundaryStart);
blockLength = fontMetricsF.horizontalAdvance(content.mid(boundaryStart, normalLength));
}
// The line being filled first (the last line of the result) reached the width limit.
if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 0) {
if (total + blockLength > LABEL_MAX_WIDTH) {
// The grapheme does not fit: step forward so the loop decrement visits it again.
fm.toNextBoundary();
snippet.prepend("\n");
} else {
snippet.prepend(word).prepend("\n");
boundaryStart = boundaryEnd;
}
normalLength = 0;
boldLength = 0;
lineCount++;
total = 0;
blockLength = 0;
continue;
} else if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 1) {
// The other line reached the width limit: trim the head and stop.
qreal distance = 0;
qreal wordSize = 0;
if (total + blockLength > LABEL_MAX_WIDTH) {
boundaryStart = boundaryEnd;
fm.toNextBoundary();
} else {
snippet.prepend(word);
}
// NOTE(review): the string literal here and in prepend() below appears empty in this view;
// the trimming logic suggests an elision mark was intended. Confirm against the repository.
// Walk forwards, accumulating the width and length of leading graphemes to remove.
while (wordSize < fontMetricsF.horizontalAdvance("")) {
boundaryEnd = fm.position();
QString firstLetter = content.mid(boundaryStart, boundaryEnd - boundaryStart);
wordSize += keyword.toUpper().contains(firstLetter.toUpper()) ?
boldMetricsF.horizontalAdvance(firstLetter) : fontMetricsF.horizontalAdvance(firstLetter);
distance += (boundaryEnd - boundaryStart);
boundaryStart = boundaryEnd;
fm.toNextBoundary();
}
snippet = snippet.right(snippet.size() - distance);
snippet.prepend("");
break;
}
snippet.prepend(word);
boundaryEnd = boundaryStart;
}
}
return sub;
return snippet;
}
QIcon FileUtils::iconFromTheme(const QString &name, const QIcon &iconDefault)
@ -932,21 +1078,30 @@ QString FileUtils::getHtmlText(const QString &text, const QString &keyword)
"}"
"</style>").arg(qApp->font().pointSizeF() + 2);
bool boldOpenned = false;
for(int i = 0; i < text.length(); i++) {
if((keyword.toUpper()).contains(QString(text.at(i)).toUpper())) {
QTextBoundaryFinder bf(QTextBoundaryFinder::Grapheme, text);
int start = 0;
for (;bf.position() != -1; bf.toNextBoundary()) {
int end = bf.position();
if (end == start) {
continue;
}
if (keyword.toUpper().contains(text.mid(start, end - start).toUpper())) {
if(! boldOpenned) {
boldOpenned = true;
htmlString.append(QString("<span>"));
}
htmlString.append(FileUtils::escapeHtml(QString(text.at(i))));
htmlString.append(FileUtils::escapeHtml(text.mid(start, end - start)));
} else {
if(boldOpenned) {
boldOpenned = false;
htmlString.append(QString("</span>"));
}
htmlString.append(FileUtils::escapeHtml(QString(text.at(i))));
htmlString.append(FileUtils::escapeHtml(text.mid(start, end - start)));
}
start = end;
}
htmlString.replace("\n", "<br />");//替换换行符
return "<pre>" + htmlString + "</pre>";
}
@ -986,176 +1141,6 @@ QString FileUtils::wrapData(QLabel *p_label, const QString &text)
}
}
}
// p_label->setText(wrapText);
return wrapText;
}
/**
 * @brief Inserts line breaks into text by measured width and cuts it off after the second break.
 *
 * The text is scanned one QChar at a time. A character contained in the upper-cased
 * keyword is measured with a bold font (application font, point size + 2); any other
 * character is measured with the regular application font. A '\n' is inserted each
 * time the measured width reaches LABEL_MAX_WIDTH; when the second break has been
 * inserted the remaining text is cut off.
 *
 * NOTE(review): the string literals passed to horizontalAdvance(), prepend() and
 * append() appear empty in this view, while the original comments speak of an
 * ellipsis. Confirm the literal against the repository.
 *
 * @param text      text to wrap
 * @param keyword   characters contained in it are measured as bold
 * @param elideLeft true: scan from the end of text and cut off the front;
 *                  false: scan from the start and cut off the tail
 * @return the wrapped (and possibly cut) text
 */
QString FileUtils::wrapData(const QString &text, const QString &keyword, bool elideLeft)
{
QString wrapText = text;
// Bold font used for characters found in the keyword.
QFont boldFont(qApp->font().family());
boldFont.setPointSizeF(qApp->font().pointSizeF() + 2);
boldFont.setWeight(QFont::Bold);
QFontMetricsF boldMetricsF(boldFont);
// Regular font used for all other characters.
QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font);
// blockLength: width of the run currently being extended; total: width of completed runs on this line.
qreal blockLength = 0;
qreal total = 0;
int lineCount = 0;
// Number of characters in the current normal / bold run.
int normalLength = 0;
int boldLength = 0;
if (elideLeft) {
// Scan from the last character towards the first.
for (int i = text.length() - 1; i >= 0; i--) {
if (keyword.toUpper().contains(text.at(i).toUpper())) {
if (normalLength) {
// A normal run just ended: add its width to the line total.
total += fontMetricsF.horizontalAdvance(text.mid(i + 1, normalLength));
normalLength = 0;
blockLength = 0;
}
if (boldLength) {
// Width of the bold run so far, not yet including the character at i.
blockLength = boldMetricsF.horizontalAdvance(text.mid(i + 1, boldLength));
}
boldLength++;
} else {
if (boldLength) {
// A bold run just ended: add its width to the line total.
total += boldMetricsF.horizontalAdvance(text.mid(i + 1, boldLength));
boldLength = 0;
blockLength = 0;
}
if (normalLength) {
// Width of the normal run so far, not yet including the character at i.
blockLength = fontMetricsF.horizontalAdvance(text.mid(i + 1, normalLength));
}
normalLength++;
}
// At the first character there is no later iteration, so measure the run including it.
if (!i) {
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.left(normalLength));
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.left(boldLength));
}
}
// The line reached the width limit: insert a break.
if (total + blockLength >= LABEL_MAX_WIDTH) {
i++;
if (total + blockLength > LABEL_MAX_WIDTH) {
// The limit was exceeded: restart the run length at 1.
if (normalLength) {
normalLength = 1;
} else {
boldLength = 1;
}
} else {
normalLength = 0;
boldLength = 0;
}
wrapText.insert(i + 1, '\n');
lineCount++;
total = 0;
blockLength = 0;
}
// Second break inserted: cut off everything in front of it.
if (lineCount == 2) {
QString leftWord = text.left(i + 1);
if (!leftWord.isEmpty()) {
qreal distance = 2;// 2 = the newline plus the first character to be replaced
qreal wordSize = 0;
for (int index = i + 1; index < text.length(); index++) {
wordSize += keyword.toUpper().contains(text.at(index).toUpper()) ?
boldMetricsF.horizontalAdvance(text.at(index)) : fontMetricsF.horizontalAdvance(text.at(index));
if (wordSize < fontMetricsF.horizontalAdvance("")) {
distance++;// the character is narrower than the ellipsis and might be covered by it, so cut one more character
} else {
break;
}
}
wrapText = wrapText.right(wrapText.size() - leftWord.size() - distance);
wrapText.prepend("");
}
break;
}
}
} else {
// Scan from the first character; i == text.length() is one extra pass that measures the final run.
for (int i = 0; i <= text.length(); i++) {
if (i == text.length()) {
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.right(normalLength));
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.right(boldLength));
}
} else {
if (keyword.toUpper().contains(text.at(i).toUpper())) {
if (normalLength) {
// A normal run just ended: add its width to the line total.
total += fontMetricsF.horizontalAdvance(text.mid(i - normalLength, normalLength));
normalLength = 0;
blockLength = 0;
}
if (boldLength) {
// Width of the bold run so far, not yet including the character at i.
blockLength = boldMetricsF.horizontalAdvance(text.mid(i - boldLength, boldLength));
}
boldLength++;
} else {
if (boldLength) {
// A bold run just ended: add its width to the line total.
total += boldMetricsF.horizontalAdvance(text.mid(i - boldLength, boldLength));
boldLength = 0;
blockLength = 0;
}
if (normalLength) {
// Width of the normal run so far, not yet including the character at i.
blockLength = fontMetricsF.horizontalAdvance(text.mid(i - normalLength, normalLength));
}
normalLength++;
}
}
// The line reached the width limit: insert a break.
if (total + blockLength >= LABEL_MAX_WIDTH) {
i--;
if (total + blockLength > LABEL_MAX_WIDTH) {
// lineCount offsets the index by the number of '\n' already inserted into wrapText.
wrapText.insert(i + lineCount, '\n');
if (normalLength) {
normalLength = 1;
} else {
boldLength = 1;
}
} else {
wrapText.insert(i + 1 + lineCount, '\n');
normalLength = 0;
boldLength = 0;
}
lineCount++;
total = 0;
blockLength = 0;
}
// Second break inserted: cut off everything after it.
if (lineCount == 2) {
QString leftWord = text.mid(i);
if (!leftWord.isEmpty()) {
// Starts at 2, as in the elideLeft branch; grows while the removed characters are narrower than the literal below.
qreal distance = 2;
qreal wordSize = 0;
for (int index = i; index > 0; index--) {
wordSize += keyword.toUpper().contains(text.at(index).toUpper()) ?
boldMetricsF.horizontalAdvance(text.at(index)) : fontMetricsF.horizontalAdvance(text.at(index));
if (wordSize < fontMetricsF.horizontalAdvance("")) {
distance++;
} else {
break;
}
}
wrapText = wrapText.left(wrapText.size() - leftWord.size() - distance);
wrapText.append("");
}
break;
}
}
}
return wrapText;
}
@ -1169,25 +1154,37 @@ qreal FileUtils::horizontalAdvanceContainsKeyword(const QString &content, const
QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font);
QTextBoundaryFinder fm(QTextBoundaryFinder::Grapheme, content);
int start = 0;
qreal contentSize = 0;
int boldLength = 0;
int normalLength = 0;
for (int i = 0; i < content.length(); i++) {
if (keyword.toUpper().contains(content.at(i).toUpper())) {
boldLength++;
for (;fm.position() != -1;fm.toNextBoundary()) {
int end = fm.position();
if (end == start) {
continue;
}
QString letter = content.mid(start, end - start);
if (keyword.toUpper().contains(letter.toUpper())) {
if (normalLength) {
contentSize += boldMetricsF.horizontalAdvance(content.mid(i - normalLength, normalLength));
contentSize += fontMetricsF.horizontalAdvance(content.mid(start - normalLength, normalLength));
normalLength = 0;
}
boldLength += (end - start);
} else {
normalLength++;
if (boldLength) {
contentSize += boldMetricsF.horizontalAdvance(content.mid(i - boldLength, boldLength));
contentSize += boldMetricsF.horizontalAdvance(content.mid(start - boldLength, boldLength));
boldLength = 0;
}
normalLength += (end - start);
}
start = end;
}
if (boldLength) {
contentSize += boldMetricsF.horizontalAdvance(content.right(boldLength));
}

View File

@ -34,7 +34,6 @@ public:
static QString getHtmlText(const QString &text, const QString &keyword);
static QString setAllTextBold(const QString &name);
static QString wrapData(QLabel *p_label, const QString &text);
static QString wrapData(const QString &text, const QString &keyword, bool elideLeft = false);
static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword);
static std::string makeDocUterm(QString path);
static QIcon getFileIcon(const QString &uri, bool checkValid = true);
@ -64,7 +63,7 @@ public:
static int openFile(QString &path, bool openInDir = false);
static bool copyPath(QString &path);
static QString escapeHtml(const QString &str);
static QString chineseSubString(const std::string &myStr, uint start, uint length, const QString &keyword);
static QString getSnippet(const std::string &myStr, uint start, const QString &keyword);
static QIcon iconFromTheme(const QString &name, const QIcon &iconDefault);
static bool isOpenXMLFileEncrypted(const QString &path);
/**

View File

@ -355,7 +355,7 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
}
}
auto pos = termIterator.positionlist_begin();
QString snippet = FileUtils::chineseSubString(data, *pos, 120, QString::fromStdString(keyWord).remove(" "));
QString snippet = FileUtils::getSnippet(data, *pos, QString::fromStdString(keyWord).remove(" "));
ri.description.prepend(SearchPluginIface::DescriptionInfo{"",FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))});
QString().swap(snippet);
@ -472,7 +472,7 @@ int OcrSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
term.skip_to(wordTobeFound);
//fix me: make a snippet without cut cjk char.
auto pos = term.positionlist_begin();
QString snippet = FileUtils::chineseSubString(data, *pos, 120, QString::fromStdString(keyWord).remove(" "));
QString snippet = FileUtils::getSnippet(data, *pos, QString::fromStdString(keyWord).remove(" "));
ri.description.prepend(SearchPluginIface::DescriptionInfo{"", FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))});
QString().swap(snippet);