Optimize the logic of the function that extracts Chinese text snippets so that it conforms to the GB18030-2022 standard.

This commit is contained in:
JunjieBai 2023-07-27 13:59:21 +08:00
parent 637e71e277
commit d6ef08f769
3 changed files with 212 additions and 216 deletions

View File

@ -41,6 +41,7 @@
#include <QClipboard> #include <QClipboard>
#include <QQueue> #include <QQueue>
#include <QFontMetrics> #include <QFontMetrics>
#include <QTextBoundaryFinder>
#include <quazip5/quazipfile.h> #include <quazip5/quazipfile.h>
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
@ -782,47 +783,192 @@ QString FileUtils::escapeHtml(const QString &str)
return temp; return temp;
} }
QString FileUtils::chineseSubString(const std::string &myStr, uint start, uint length, const QString &keyword) QString FileUtils::getSnippet(const std::string &myStr, uint start, const QString &keyword)
{ {
std::string afterSub = ""; QFont boldFont(qApp->font().family());
QString sub = QString::fromStdString(myStr); boldFont.setPointSizeF(qApp->font().pointSizeF() + 2);
boldFont.setWeight(QFont::Bold);
QFontMetricsF boldMetricsF(boldFont);
if (length >= myStr.length()) { uint strLength = 240;
afterSub = myStr.substr(start,length); //截取; bool elideLeft(false);
if (horizontalAdvanceContainsKeyword(QString::fromStdString(afterSub), keyword) >= 2*LABEL_MAX_WIDTH) { std::string sub = myStr.substr(start, strLength);
sub = QString::fromStdString(afterSub); QString content = QString::fromStdString(sub);
}
return wrapData(sub, keyword);
}
//从关键字截length个字文本内容长度够截 //不够截往前补
if (start + length <= myStr.length()) { if (start + strLength > myStr.length()) {
afterSub = myStr.substr(start,length); //截取 //新的起始位置
sub = QString::fromStdString(afterSub); //转QString int newStart = myStr.length() - strLength;
if(start + length < myStr.length()){ if (myStr.length() < strLength) {
sub.replace(sub.length() - 1, 1, ""); //最后一位可能为乱码,替换掉 newStart = 0;
} sub = myStr;
sub = wrapData(sub, keyword);
} else { } else {
uint newStart = myStr.length() - length; //从start截到末尾长度不够length更新截取位置到末尾前length个字的位置 sub = myStr.substr(newStart, strLength);
afterSub = myStr.substr(newStart, length); }
sub = QString::fromStdString(afterSub);
if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(newStart, start)), keyword) >= 2*LABEL_MAX_WIDTH) { if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(newStart, start)) + boldMetricsF.horizontalAdvance(keyword), keyword) > 2 * LABEL_MAX_WIDTH) {
sub = wrapData(sub.replace(0, 1, ""), keyword, true); if (horizontalAdvanceContainsKeyword(QString::fromStdString(myStr.substr(start)), keyword) <= 2 * LABEL_MAX_WIDTH) {
elideLeft = true;
} else { } else {
if (newStart + 3 < start) { sub = myStr.substr(start);
sub.replace(0, 1, "")/*.append("…")*/; //第一个字有可能乱码,直接替换 }
}
content = QString::fromStdString(sub);
}
QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font);
qreal blockLength = 0;
qreal total = 0;
int lineCount = 0;
int normalLength = 0;
int boldLength = 0;
QString snippet;
int boundaryStart = 0;
int boundaryEnd = 0;
QTextBoundaryFinder fm(QTextBoundaryFinder::Grapheme, content);
if (!elideLeft) {
for (;fm.position() != -1;fm.toNextBoundary()) {
boundaryEnd = fm.position();
QString word = content.mid(boundaryStart, boundaryEnd - boundaryStart);
if (boundaryStart == boundaryEnd) {
continue;
}
if (keyword.toUpper().contains(word.toUpper())) {
if (normalLength) {
total += fontMetricsF.horizontalAdvance(content.mid(boundaryStart - normalLength, normalLength));
normalLength = 0;
blockLength = 0;
}
boldLength += (boundaryEnd - boundaryStart);
blockLength = boldMetricsF.horizontalAdvance(content.mid(boundaryEnd - boldLength, boldLength));
} else { } else {
afterSub = myStr.substr(start, length); //需要往前补三位以内说明补的全是乱码直接从start截就完了 if (boldLength) {
sub = "" + QString::fromStdString(afterSub); total += boldMetricsF.horizontalAdvance(content.mid(boundaryStart - boldLength, boldLength));
// sub.append("…"); boldLength = 0;
blockLength = 0;
} }
sub = wrapData(sub, keyword); normalLength += (boundaryEnd - boundaryStart);
blockLength = fontMetricsF.horizontalAdvance(content.mid(boundaryEnd - normalLength, normalLength));
}
if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 0) {
if (total + blockLength > LABEL_MAX_WIDTH) {
fm.toPreviousBoundary();
snippet.append("\n");
} else {
snippet.append(word).append("\n");
boundaryStart = boundaryEnd;
}
normalLength = 0;
boldLength = 0;
lineCount++;
total = 0;
blockLength = 0;
continue;
} else if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 1) {
qreal distance = 0;
qreal wordSize = 0;
if (total + blockLength > LABEL_MAX_WIDTH) {
boundaryEnd = boundaryStart;
fm.toPreviousBoundary();
} else {
snippet.append(word);
}
while (wordSize < fontMetricsF.horizontalAdvance("")) {
boundaryStart = fm.position();
wordSize += keyword.toUpper().contains(content.mid(boundaryStart, boundaryEnd - boundaryStart).toUpper()) ?
boldMetricsF.horizontalAdvance(content.mid(boundaryStart, boundaryEnd - boundaryStart))
: fontMetricsF.horizontalAdvance(content.mid(boundaryStart, boundaryEnd - boundaryStart));
distance += (boundaryEnd - boundaryStart);
boundaryEnd = boundaryStart;
fm.toPreviousBoundary();
}
snippet = snippet.left(snippet.size() - distance);
snippet.append("");
break;
}
snippet.append(word);
boundaryStart = boundaryEnd;
}
} else {
boundaryEnd = content.size();
for (fm.toEnd(); fm.position() != -1; fm.toPreviousBoundary()) {
boundaryStart = fm.position();
if (boundaryEnd == boundaryStart) {
continue;
}
QString word = content.mid(boundaryStart, boundaryEnd - boundaryStart);
if (keyword.toUpper().contains(word.toUpper())) {
if (normalLength) {
total += fontMetricsF.horizontalAdvance(content.mid(boundaryEnd, normalLength));
normalLength = 0;
blockLength = 0;
}
boldLength += (boundaryEnd - boundaryStart);
blockLength = boldMetricsF.horizontalAdvance(content.mid(boundaryStart, boldLength));
} else {
if (boldLength) {
total += boldMetricsF.horizontalAdvance(content.mid(boundaryEnd, boldLength));
boldLength = 0;
blockLength = 0;
}
normalLength += (boundaryEnd - boundaryStart);
blockLength = fontMetricsF.horizontalAdvance(content.mid(boundaryStart, normalLength));
}
if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 0) {
if (total + blockLength > LABEL_MAX_WIDTH) {
fm.toNextBoundary();
snippet.prepend("\n");
} else {
snippet.prepend(word).prepend("\n");
boundaryStart = boundaryEnd;
}
normalLength = 0;
boldLength = 0;
lineCount++;
total = 0;
blockLength = 0;
continue;
} else if (total + blockLength >= LABEL_MAX_WIDTH && lineCount == 1) {
qreal distance = 0;
qreal wordSize = 0;
if (total + blockLength > LABEL_MAX_WIDTH) {
boundaryStart = boundaryEnd;
fm.toNextBoundary();
} else {
snippet.prepend(word);
}
while (wordSize < fontMetricsF.horizontalAdvance("")) {
boundaryEnd = fm.position();
QString firstLetter = content.mid(boundaryStart, boundaryEnd - boundaryStart);
wordSize += keyword.toUpper().contains(firstLetter.toUpper()) ?
boldMetricsF.horizontalAdvance(firstLetter) : fontMetricsF.horizontalAdvance(firstLetter);
distance += (boundaryEnd - boundaryStart);
boundaryStart = boundaryEnd;
fm.toNextBoundary();
}
snippet = snippet.right(snippet.size() - distance);
snippet.prepend("");
break;
}
snippet.prepend(word);
boundaryEnd = boundaryStart;
} }
} }
return sub;
return snippet;
} }
QIcon FileUtils::iconFromTheme(const QString &name, const QIcon &iconDefault) QIcon FileUtils::iconFromTheme(const QString &name, const QIcon &iconDefault)
@ -932,21 +1078,30 @@ QString FileUtils::getHtmlText(const QString &text, const QString &keyword)
"}" "}"
"</style>").arg(qApp->font().pointSizeF() + 2); "</style>").arg(qApp->font().pointSizeF() + 2);
bool boldOpenned = false; bool boldOpenned = false;
for(int i = 0; i < text.length(); i++) {
if((keyword.toUpper()).contains(QString(text.at(i)).toUpper())) { QTextBoundaryFinder bf(QTextBoundaryFinder::Grapheme, text);
int start = 0;
for (;bf.position() != -1; bf.toNextBoundary()) {
int end = bf.position();
if (end == start) {
continue;
}
if (keyword.toUpper().contains(text.mid(start, end - start).toUpper())) {
if(! boldOpenned) { if(! boldOpenned) {
boldOpenned = true; boldOpenned = true;
htmlString.append(QString("<span>")); htmlString.append(QString("<span>"));
} }
htmlString.append(FileUtils::escapeHtml(QString(text.at(i)))); htmlString.append(FileUtils::escapeHtml(text.mid(start, end - start)));
} else { } else {
if(boldOpenned) { if(boldOpenned) {
boldOpenned = false; boldOpenned = false;
htmlString.append(QString("</span>")); htmlString.append(QString("</span>"));
} }
htmlString.append(FileUtils::escapeHtml(QString(text.at(i)))); htmlString.append(FileUtils::escapeHtml(text.mid(start, end - start)));
} }
start = end;
} }
htmlString.replace("\n", "<br />");//替换换行符 htmlString.replace("\n", "<br />");//替换换行符
return "<pre>" + htmlString + "</pre>"; return "<pre>" + htmlString + "</pre>";
} }
@ -986,176 +1141,6 @@ QString FileUtils::wrapData(QLabel *p_label, const QString &text)
} }
} }
} }
// p_label->setText(wrapText);
return wrapText;
}
QString FileUtils::wrapData(const QString &text, const QString &keyword, bool elideLeft)
{
QString wrapText = text;
QFont boldFont(qApp->font().family());
boldFont.setPointSizeF(qApp->font().pointSizeF() + 2);
boldFont.setWeight(QFont::Bold);
QFontMetricsF boldMetricsF(boldFont);
QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font);
qreal blockLength = 0;
qreal total = 0;
int lineCount = 0;
int normalLength = 0;
int boldLength = 0;
if (elideLeft) {
for (int i = text.length() - 1; i >= 0; i--) {
if (keyword.toUpper().contains(text.at(i).toUpper())) {
if (normalLength) {
total += fontMetricsF.horizontalAdvance(text.mid(i + 1, normalLength));
normalLength = 0;
blockLength = 0;
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.mid(i + 1, boldLength));
}
boldLength++;
} else {
if (boldLength) {
total += boldMetricsF.horizontalAdvance(text.mid(i + 1, boldLength));
boldLength = 0;
blockLength = 0;
}
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.mid(i + 1, normalLength));
}
normalLength++;
}
if (!i) {
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.left(normalLength));
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.left(boldLength));
}
}
if (total + blockLength >= LABEL_MAX_WIDTH) {
i++;
if (total + blockLength > LABEL_MAX_WIDTH) {
if (normalLength) {
normalLength = 1;
} else {
boldLength = 1;
}
} else {
normalLength = 0;
boldLength = 0;
}
wrapText.insert(i + 1, '\n');
lineCount++;
total = 0;
blockLength = 0;
}
if (lineCount == 2) {
QString leftWord = text.left(i + 1);
if (!leftWord.isEmpty()) {
qreal distance = 2;//2是换行符加上要换第一个字
qreal wordSize = 0;
for (int index = i + 1; index < text.length(); index++) {
wordSize += keyword.toUpper().contains(text.at(index).toUpper()) ?
boldMetricsF.horizontalAdvance(text.at(index)) : fontMetricsF.horizontalAdvance(text.at(index));
if (wordSize < fontMetricsF.horizontalAdvance("")) {
distance++;//字长度比省略号要小,可能会挡上,所以再截一个字
} else {
break;
}
}
wrapText = wrapText.right(wrapText.size() - leftWord.size() - distance);
wrapText.prepend("");
}
break;
}
}
} else {
for (int i = 0; i <= text.length(); i++) {
if (i == text.length()) {
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.right(normalLength));
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.right(boldLength));
}
} else {
if (keyword.toUpper().contains(text.at(i).toUpper())) {
if (normalLength) {
total += fontMetricsF.horizontalAdvance(text.mid(i - normalLength, normalLength));
normalLength = 0;
blockLength = 0;
}
if (boldLength) {
blockLength = boldMetricsF.horizontalAdvance(text.mid(i - boldLength, boldLength));
}
boldLength++;
} else {
if (boldLength) {
total += boldMetricsF.horizontalAdvance(text.mid(i - boldLength, boldLength));
boldLength = 0;
blockLength = 0;
}
if (normalLength) {
blockLength = fontMetricsF.horizontalAdvance(text.mid(i - normalLength, normalLength));
}
normalLength++;
}
}
if (total + blockLength >= LABEL_MAX_WIDTH) {
i--;
if (total + blockLength > LABEL_MAX_WIDTH) {
wrapText.insert(i + lineCount, '\n');
if (normalLength) {
normalLength = 1;
} else {
boldLength = 1;
}
} else {
wrapText.insert(i + 1 + lineCount, '\n');
normalLength = 0;
boldLength = 0;
}
lineCount++;
total = 0;
blockLength = 0;
}
if (lineCount == 2) {
QString leftWord = text.mid(i);
if (!leftWord.isEmpty()) {
qreal distance = 2;
qreal wordSize = 0;
for (int index = i; index > 0; index--) {
wordSize += keyword.toUpper().contains(text.at(index).toUpper()) ?
boldMetricsF.horizontalAdvance(text.at(index)) : fontMetricsF.horizontalAdvance(text.at(index));
if (wordSize < fontMetricsF.horizontalAdvance("")) {
distance++;
} else {
break;
}
}
wrapText = wrapText.left(wrapText.size() - leftWord.size() - distance);
wrapText.append("");
}
break;
}
}
}
return wrapText; return wrapText;
} }
@ -1169,25 +1154,37 @@ qreal FileUtils::horizontalAdvanceContainsKeyword(const QString &content, const
QFont font(qApp->font().family()); QFont font(qApp->font().family());
font.setPointSizeF(qApp->font().pointSizeF()); font.setPointSizeF(qApp->font().pointSizeF());
QFontMetricsF fontMetricsF(font); QFontMetricsF fontMetricsF(font);
QTextBoundaryFinder fm(QTextBoundaryFinder::Grapheme, content);
int start = 0;
qreal contentSize = 0; qreal contentSize = 0;
int boldLength = 0; int boldLength = 0;
int normalLength = 0; int normalLength = 0;
for (int i = 0; i < content.length(); i++) { for (;fm.position() != -1;fm.toNextBoundary()) {
if (keyword.toUpper().contains(content.at(i).toUpper())) { int end = fm.position();
boldLength++; if (end == start) {
continue;
}
QString letter = content.mid(start, end - start);
if (keyword.toUpper().contains(letter.toUpper())) {
if (normalLength) { if (normalLength) {
contentSize += boldMetricsF.horizontalAdvance(content.mid(i - normalLength, normalLength)); contentSize += fontMetricsF.horizontalAdvance(content.mid(start - normalLength, normalLength));
normalLength = 0; normalLength = 0;
} }
boldLength += (end - start);
} else { } else {
normalLength++;
if (boldLength) { if (boldLength) {
contentSize += boldMetricsF.horizontalAdvance(content.mid(i - boldLength, boldLength)); contentSize += boldMetricsF.horizontalAdvance(content.mid(start - boldLength, boldLength));
boldLength = 0; boldLength = 0;
} }
normalLength += (end - start);
} }
start = end;
} }
if (boldLength) { if (boldLength) {
contentSize += boldMetricsF.horizontalAdvance(content.right(boldLength)); contentSize += boldMetricsF.horizontalAdvance(content.right(boldLength));
} }

View File

@ -34,7 +34,6 @@ public:
static QString getHtmlText(const QString &text, const QString &keyword); static QString getHtmlText(const QString &text, const QString &keyword);
static QString setAllTextBold(const QString &name); static QString setAllTextBold(const QString &name);
static QString wrapData(QLabel *p_label, const QString &text); static QString wrapData(QLabel *p_label, const QString &text);
static QString wrapData(const QString &text, const QString &keyword, bool elideLeft = false);
static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword); static qreal horizontalAdvanceContainsKeyword(const QString &content, const QString &keyword);
static std::string makeDocUterm(QString path); static std::string makeDocUterm(QString path);
static QIcon getFileIcon(const QString &uri, bool checkValid = true); static QIcon getFileIcon(const QString &uri, bool checkValid = true);
@ -64,7 +63,7 @@ public:
static int openFile(QString &path, bool openInDir = false); static int openFile(QString &path, bool openInDir = false);
static bool copyPath(QString &path); static bool copyPath(QString &path);
static QString escapeHtml(const QString &str); static QString escapeHtml(const QString &str);
static QString chineseSubString(const std::string &myStr, uint start, uint length, const QString &keyword); static QString getSnippet(const std::string &myStr, uint start, const QString &keyword);
static QIcon iconFromTheme(const QString &name, const QIcon &iconDefault); static QIcon iconFromTheme(const QString &name, const QIcon &iconDefault);
static bool isOpenXMLFileEncrypted(const QString &path); static bool isOpenXMLFileEncrypted(const QString &path);
/** /**

View File

@ -355,7 +355,7 @@ int FileContentSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
} }
} }
auto pos = termIterator.positionlist_begin(); auto pos = termIterator.positionlist_begin();
QString snippet = FileUtils::chineseSubString(data, *pos, 120, QString::fromStdString(keyWord).remove(" ")); QString snippet = FileUtils::getSnippet(data, *pos, QString::fromStdString(keyWord).remove(" "));
ri.description.prepend(SearchPluginIface::DescriptionInfo{"",FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))}); ri.description.prepend(SearchPluginIface::DescriptionInfo{"",FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))});
QString().swap(snippet); QString().swap(snippet);
@ -472,7 +472,7 @@ int OcrSearch::getResult(Xapian::MSet &result, std::string &keyWord) {
term.skip_to(wordTobeFound); term.skip_to(wordTobeFound);
//fix me: make a snippet without cut cjk char. //fix me: make a snippet without cut cjk char.
auto pos = term.positionlist_begin(); auto pos = term.positionlist_begin();
QString snippet = FileUtils::chineseSubString(data, *pos, 120, QString::fromStdString(keyWord).remove(" ")); QString snippet = FileUtils::getSnippet(data, *pos, QString::fromStdString(keyWord).remove(" "));
ri.description.prepend(SearchPluginIface::DescriptionInfo{"", FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))}); ri.description.prepend(SearchPluginIface::DescriptionInfo{"", FileUtils::getHtmlText(snippet, QString::fromStdString(keyWord).remove(" "))});
QString().swap(snippet); QString().swap(snippet);