mirror of https://gitee.com/openkylin/doxygen.git
387 lines
11 KiB
C++
387 lines
11 KiB
C++
/******************************************************************************
 *
 * Copyright (C) 1997-2015 by Dimitri van Heesch.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation under the terms of the GNU General Public License is hereby
 * granted. No representations are made about the suitability of this software
 * for any purpose. It is provided "as is" without express or implied warranty.
 * See the GNU General Public License for more details.
 *
 * Documents produced by Doxygen are derivative works derived from the
 * input used in their production; they are not affected by this license.
 *
 */
|
|
|
|
// STL includes
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <iostream>
|
|
#include <string>
|
|
#include <algorithm>
|
|
#include <sstream>
|
|
#include <fstream>
|
|
#include <iterator>
|
|
#include <regex>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
// Xapian include
|
|
#include <xapian.h>
|
|
|
|
#include "version.h"
|
|
#include "xml.h"
|
|
|
|
/** Maximum length of a term that is added to the index; longer terms are skipped. */
#define MAX_TERM_LENGTH 245

/** Directory separator that is appended to the output directory when it is missing. */
#if defined(_WIN32) && !defined(__CYGWIN__)
static const char pathSep = '\\';
#else
static const char pathSep = '/';
#endif
|
|
|
|
static void safeAddTerm(const std::string &term,Xapian::Document &doc,int wfd)
|
|
{
|
|
if (term.length()<=MAX_TERM_LENGTH) doc.add_term(term,wfd);
|
|
}
|
|
|
|
/** Returns a copy of \a str without the leading and trailing characters
 *  that occur in \a whitespace. The result is empty when \a str contains
 *  no other characters.
 */
static std::string trim(const std::string& str,
                        const std::string& whitespace = " \t")
{
  const size_t first = str.find_first_not_of(whitespace);
  if (first == std::string::npos)
  {
    return std::string(); // nothing but whitespace (or empty input)
  }
  const size_t last = str.find_last_not_of(whitespace);
  return std::string(str, first, last - first + 1);
}
|
|
|
|
/** trims \a whitespace from start and end and replace occurrences of
|
|
* \a whitespace with \a fill.
|
|
*/
|
|
static std::string reduce(const std::string& str,
|
|
const std::string& fill = " ",
|
|
const std::string& whitespace = " \t")
|
|
{
|
|
// trim first
|
|
std::string result = trim(str, whitespace);
|
|
|
|
// replace sub ranges
|
|
size_t beginSpace = result.find_first_of(whitespace);
|
|
while (beginSpace != std::string::npos)
|
|
{
|
|
size_t endSpace = result.find_first_not_of(whitespace, beginSpace);
|
|
size_t range = endSpace - beginSpace;
|
|
|
|
result.replace(beginSpace, range, fill);
|
|
|
|
size_t newStart = beginSpace + fill.length();
|
|
beginSpace = result.find_first_of(whitespace, newStart);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/** Adds all words in \a s to document \a doc with weight \a wfd */
|
|
static void addWords(const std::string &s,Xapian::Document &doc,int wfd)
|
|
{
|
|
std::istringstream iss(s);
|
|
std::istream_iterator<std::string> begin(iss),end,it;
|
|
for (it=begin;it!=end;++it)
|
|
{
|
|
std::string word = *it;
|
|
std::string lword = word;
|
|
std::transform(lword.begin(), lword.end(), lword.begin(), ::tolower);
|
|
safeAddTerm(word,doc,wfd);
|
|
if (lword!=word)
|
|
{
|
|
safeAddTerm(lword,doc,wfd);
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Adds all identifiers in \a s to document \a doc with weight \a wfd */
|
|
static void addIdentifiers(const std::string &s,Xapian::Document &doc,int wfd)
|
|
{
|
|
std::regex id_re("[A-Z_a-z][A-Z_a-z0-9]*");
|
|
auto id_begin = std::sregex_iterator(s.begin(), s.end(), id_re);
|
|
auto id_end = std::sregex_iterator();
|
|
|
|
for (auto i = id_begin; i!=id_end; ++i)
|
|
{
|
|
std::smatch match = *i;
|
|
safeAddTerm(match.str(),doc,wfd);
|
|
}
|
|
}
|
|
|
|
/** Replaces all occurrences of \a old with \a repl in string \a str.
 *
 *  The replacement text itself is not scanned again, so \a repl may contain
 *  \a old. An empty \a old leaves \a str unchanged.
 */
static void replace_all(std::string& str, const std::string& old, const std::string& repl)
{
  if (old.empty())
  {
    return; // an empty pattern matches at every position and would never terminate
  }
  size_t pos = 0;
  while ((pos = str.find(old, pos)) != std::string::npos)
  {
    str.replace(pos, old.length(), repl);
    pos += repl.length(); // continue behind the text that was just inserted
  }
}
|
|
|
|
/** Replaces all XML entities in \a s with their unescaped representation */
|
|
static std::string unescapeXmlEntities(const std::string &s)
|
|
{
|
|
std::string result=s;
|
|
replace_all(result,">",">");
|
|
replace_all(result,"<","<");
|
|
replace_all(result,"'","'");
|
|
replace_all(result,""","\"");
|
|
replace_all(result,"&","&");
|
|
return result;
|
|
}
|
|
|
|
/** This class is a wrapper around SAX style XML parser, which
|
|
* parses the file without first building a DOM tree in memory.
|
|
*/
|
|
class XMLContentHandler
|
|
{
|
|
public:
|
|
/** Handler for parsing XML data */
|
|
XMLContentHandler(const std::string &path)
|
|
: m_db(path+"doxysearch.db",Xapian::DB_CREATE_OR_OVERWRITE),
|
|
m_stemmer("english")
|
|
{
|
|
m_curFieldName = UnknownField;
|
|
m_indexer.set_stemmer(m_stemmer);
|
|
m_indexer.set_document(m_doc);
|
|
}
|
|
|
|
/** Free data handler */
|
|
~XMLContentHandler()
|
|
{
|
|
m_db.commit();
|
|
}
|
|
|
|
enum FieldNames
|
|
{
|
|
UnknownField = 0,
|
|
TypeField = 1,
|
|
NameField = 2,
|
|
ArgsField = 3,
|
|
TagField = 4,
|
|
UrlField = 5,
|
|
KeywordField = 6,
|
|
TextField = 7
|
|
};
|
|
|
|
/** Handler for a start tag. Called for <doc> and <field> tags */
|
|
void startElement(const std::string &name, const XMLHandlers::Attributes &attrib)
|
|
{
|
|
m_data="";
|
|
if (name=="field")
|
|
{
|
|
std::string fieldName = XMLHandlers::value(attrib,"name");
|
|
if (fieldName=="type") m_curFieldName=TypeField;
|
|
else if (fieldName=="name") m_curFieldName=NameField;
|
|
else if (fieldName=="args") m_curFieldName=ArgsField;
|
|
else if (fieldName=="tag") m_curFieldName=TagField;
|
|
else if (fieldName=="url") m_curFieldName=UrlField;
|
|
else if (fieldName=="keywords") m_curFieldName=KeywordField;
|
|
else if (fieldName=="text") m_curFieldName=TextField;
|
|
else m_curFieldName=UnknownField;
|
|
}
|
|
}
|
|
|
|
/** Handler for an end tag. Called for </doc> and </field> tags */
|
|
void endElement(const std::string &name)
|
|
{
|
|
if (name=="doc") // </doc>
|
|
{
|
|
std::string term = m_doc.get_value(NameField);
|
|
std::string partTerm;
|
|
size_t pos = term.rfind("::");
|
|
if (pos!=std::string::npos)
|
|
{
|
|
partTerm = term.substr(pos+2);
|
|
}
|
|
if (m_doc.get_value(TypeField)=="class" ||
|
|
m_doc.get_value(TypeField)=="file" ||
|
|
m_doc.get_value(TypeField)=="namespace") // containers get highest prio
|
|
{
|
|
safeAddTerm(term,m_doc,1000);
|
|
if (!partTerm.empty())
|
|
{
|
|
safeAddTerm(partTerm,m_doc,500);
|
|
}
|
|
}
|
|
else // members and others get lower prio
|
|
{
|
|
safeAddTerm(m_doc.get_value(NameField),m_doc,100);
|
|
if (!partTerm.empty())
|
|
{
|
|
safeAddTerm(partTerm,m_doc,50);
|
|
}
|
|
}
|
|
m_db.add_document(m_doc);
|
|
m_doc.clear_values();
|
|
m_doc.clear_terms();
|
|
}
|
|
else if (name=="field" && m_curFieldName!=UnknownField) // </field>
|
|
{
|
|
// strip whitespace from m_data
|
|
m_data = reduce(m_data);
|
|
// replace XML entities
|
|
m_data = unescapeXmlEntities(m_data);
|
|
// add data to the document
|
|
m_doc.add_value(m_curFieldName,m_data);
|
|
switch (m_curFieldName)
|
|
{
|
|
case TypeField:
|
|
case NameField:
|
|
case TagField:
|
|
case UrlField:
|
|
// meta data that is not searchable
|
|
break;
|
|
case KeywordField:
|
|
addWords(m_data,m_doc,50);
|
|
break;
|
|
case ArgsField:
|
|
addIdentifiers(m_data,m_doc,10);
|
|
break;
|
|
case TextField:
|
|
addWords(m_data,m_doc,2);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
m_data="";
|
|
m_curFieldName=UnknownField;
|
|
}
|
|
// reset m_data
|
|
}
|
|
|
|
/** Handler for inline text */
|
|
void characters(const std::string& ch)
|
|
{
|
|
m_data += ch;
|
|
}
|
|
|
|
void error(const std::string &fileName,int lineNr,const std::string &msg)
|
|
{
|
|
std::cerr << "Fatal error at " << fileName << ":" << lineNr << ": " << msg << std::endl;
|
|
}
|
|
|
|
private:
|
|
|
|
// internal state
|
|
Xapian::WritableDatabase m_db;
|
|
Xapian::Document m_doc;
|
|
Xapian::TermGenerator m_indexer;
|
|
Xapian::Stem m_stemmer;
|
|
std::string m_data;
|
|
FieldNames m_curFieldName;
|
|
};
|
|
|
|
/** Prints the command line syntax of the program called \a name to the
 *  standard error stream and terminates the process with exit status
 *  \a exitVal.
 */
static void usage(const char *name, int exitVal = 1)
{
  std::cerr << "Usage: " << name
            << " [-o output_dir] searchdata.xml [searchdata2.xml ...]"
            << std::endl;
  std::exit(exitVal);
}
|
|
|
|
/** Returns the contents of file \a fileName as a string.
 *
 *  When the file cannot be opened an error is written to the standard error
 *  stream and an empty string is returned.
 */
inline std::string fileToString(const std::string &fileName)
{
  std::string result;
  std::ifstream t(fileName);
  if (!t.is_open())
  {
    std::cerr << "Error: unable to open file " << fileName << " for reading" << std::endl;
    return result;
  }
  // determine the file size to allocate the buffer in one go; tellg() returns
  // -1 on failure (e.g. when the stream is not seekable), which must not be
  // passed on to reserve() as it would be converted to a huge unsigned value
  t.seekg(0, std::ios::end);
  const std::streamoff size = t.tellg();
  if (size>0)
  {
    result.reserve(static_cast<std::string::size_type>(size));
  }
  t.seekg(0, std::ios::beg);
  result.assign(std::istreambuf_iterator<char>(t),
                std::istreambuf_iterator<char>());
  return result;
}
|
|
|
|
/** Returns \c true when \a path refers to an existing directory, and
 *  \c false when it is null, does not exist, or is not a directory.
 */
bool dirExists(const char *path)
{
  if (path==nullptr)
  {
    return false;
  }
  struct stat info = {};
  // the file type has to be compared against the S_IFMT mask: testing the
  // S_IFDIR bit alone also matches other types that share the bit, such as
  // block devices
  return stat(path,&info)==0 && (info.st_mode & S_IFMT)==S_IFDIR;
}
|
|
|
|
/** main function to index data */
|
|
int main(int argc,const char **argv)
|
|
{
|
|
if (argc<2)
|
|
{
|
|
usage(argv[0]);
|
|
}
|
|
std::string outputDir;
|
|
for (int i=1;i<argc;i++)
|
|
{
|
|
if (std::string(argv[i])=="-o")
|
|
{
|
|
if (i>=argc-1)
|
|
{
|
|
std::cerr << "Error: missing parameter for -o option" << std::endl;
|
|
usage(argv[0]);
|
|
}
|
|
else
|
|
{
|
|
i++;
|
|
outputDir=argv[i];
|
|
if (!dirExists(outputDir.c_str()))
|
|
{
|
|
std::cerr << "Error: specified output directory does not exist!" << std::endl;
|
|
usage(argv[0]);
|
|
}
|
|
}
|
|
}
|
|
else if (std::string(argv[i])=="-h" || std::string(argv[i])=="--help")
|
|
{
|
|
usage(argv[0],0);
|
|
}
|
|
else if (std::string(argv[i])=="-v" || std::string(argv[i])=="--version")
|
|
{
|
|
std::cerr << argv[0] << " version: " << getFullVersion() << std::endl;
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
try
|
|
{
|
|
if (!outputDir.empty() && outputDir.at(outputDir.length()-1)!=pathSep)
|
|
{
|
|
outputDir+=pathSep;
|
|
}
|
|
XMLContentHandler contentHandler(outputDir);
|
|
XMLHandlers handlers;
|
|
handlers.startElement = [&contentHandler](const std::string &name,const XMLHandlers::Attributes &attrs) { contentHandler.startElement(name,attrs); };
|
|
handlers.endElement = [&contentHandler](const std::string &name) { contentHandler.endElement(name); };
|
|
handlers.characters = [&contentHandler](const std::string &chars) { contentHandler.characters(chars); };
|
|
handlers.error = [&contentHandler](const std::string &fileName,int lineNr,const std::string &msg) { contentHandler.error(fileName,lineNr,msg); };
|
|
for (int i=1;i<argc;i++)
|
|
{
|
|
if (std::string(argv[i])=="-o")
|
|
{
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
std::cout << "Processing " << argv[i] << "..." << std::endl;
|
|
std::string inputStr = fileToString(argv[i]);
|
|
XMLParser parser(handlers);
|
|
parser.parse(argv[i],inputStr.c_str(),false);
|
|
}
|
|
}
|
|
}
|
|
catch(const Xapian::Error &e)
|
|
{
|
|
std::cerr << "Caught exception: " << e.get_description() << std::endl;
|
|
}
|
|
catch(...)
|
|
{
|
|
std::cerr << "Caught an unknown exception" << std::endl;
|
|
}
|
|
|
|
return 0;
|
|
}
|