Remove parsers in included hunspell

This commit is contained in:
Jean-Marc Lasgouttes 2020-12-20 19:39:35 +01:00
parent 8d0d3ea090
commit d04a8cf58f
17 changed files with 0 additions and 1755 deletions

View File

@ -1,18 +0,0 @@
# Automake rules for the text-format parsers.
# Adds the hunspell core directory of the build tree to the include path.
AM_CPPFLAGS=-I${top_builddir}/src/hunspell
# Static library containing every parser; "noinst" means it is built but
# never installed.
noinst_LIBRARIES=libparsers.a
libparsers_a_SOURCES=firstparser.cxx xmlparser.cxx \
latexparser.cxx manparser.cxx \
textparser.cxx htmlparser.cxx \
odfparser.cxx
# Test driver, also not installed. It compiles the parser sources directly
# (headers are listed here as well) instead of linking libparsers.a.
noinst_PROGRAMS=testparser
testparser_SOURCES=firstparser.cxx firstparser.hxx xmlparser.cxx \
xmlparser.hxx latexparser.cxx latexparser.hxx \
manparser.cxx manparser.hxx testparser.cxx \
textparser.cxx textparser.hxx htmlparser.cxx \
htmlparser.hxx odfparser.hxx odfparser.cxx
# Link the programs against the hunspell core library:
# need mystrdup()
LDADD = ../hunspell/libhunspell-1.7.la

View File

@ -1,65 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "firstparser.hxx"
#ifndef W32
using namespace std;
#endif
// Constructs the parser; `wordchars` is handed unchanged to the TextParser
// base class.
FirstParser::FirstParser(const char* wordchars) : TextParser(wordchars) {}
// The class adds no data members of its own, so there is nothing to release.
FirstParser::~FirstParser() {
}
// Yields everything in front of the first tab of the current line.
//
// `t` is always emptied first. A token is produced only when the line
// contains a tab located beyond the stored `token` position; `token` is then
// moved onto that tab, so repeating the call on the same line returns false.
bool FirstParser::next_token(std::string& t) {
  t.clear();

  const std::string& current = line[actual];
  const size_t tab_at = current.find('\t');

  // No tab at all, or this tab has already been reported.
  if (tab_at == std::string::npos || tab_at <= token)
    return false;

  token = tab_at;
  t = current.substr(0, tab_at);
  return true;
}

View File

@ -1,56 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef FIRSTPARSER_HXX_
#define FIRSTPARSER_HXX_
#include "textparser.hxx"
/*
* Check first word of the input line
*
*/
class FirstParser : public TextParser {
 public:
  // `wordchars` is passed on to the TextParser base class.
  explicit FirstParser(const char* wordchars);
  virtual ~FirstParser();

  // Stores the next token in `token_out`; returns false when there is none.
  virtual bool next_token(std::string& token_out);
};
#endif

View File

@ -1,88 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "htmlparser.hxx"
#ifndef W32
using namespace std;
#endif
// Pattern tables that HTMLParser::next_token() passes to
// XMLParser::next_token(). Every row is an {opening text, closing text}
// pair written in lower case.
// NOTE(review): the matching rules live in XMLParser, outside this file; the
// remarks below assume it compares case-insensitively and uses the first row
// that fits -- confirm there.

// Regions running from the opening text to the closing text. Specific
// elements are listed before the catch-all {"<", ">"} row.
static const char* PATTERN[][2] = {
    {"<script", "</script>"},
    {"<style", "</style>"},
    {"<code", "</code>"},
    {"<samp", "</samp>"},
    {"<kbd", "</kbd>"},
    {"<var", "</var>"},
    {"<listing", "</listing>"},
    {"<address", "</address>"},
    {"<pre", "</pre>"},
    {"<!--", "-->"},       // comment
    // CDATA section. The real opener is "<![CDATA[": without the '!' this
    // row could never match and the section fell through to {"<", ">"},
    // which stops at the first '>' inside the character data.
    {"<![cdata[", "]]>"},
    {"<", ">"}};           // any other tag
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))

// Tag / attribute pairs: ALT and TITLE attributes are handled specially.
static const char* PATTERN2[][2] = {
    {"<img", "alt="},
    {"<img", "title="},
    {"<a ", "title="}};
#define PATTERN_LEN2 (sizeof(PATTERN2) / sizeof(PATTERN2[0]))

// HTML supplies no third table.
static const char* (*PATTERN3)[2] = NULL;
#define PATTERN_LEN3 0
// Narrow-character constructor: `wordchars` goes straight to XMLParser.
HTMLParser::HTMLParser(const char* wordchars) : XMLParser(wordchars) {}

// Wide-character constructor: `wordchars` and `len` go straight to XMLParser.
HTMLParser::HTMLParser(const w_char* wordchars, int len)
    : XMLParser(wordchars, len) {}
// Runs the generic XML tokenizer with the HTML pattern tables of this file
// and passes its result through; the token is written to `t`.
bool HTMLParser::next_token(std::string& t) {
  const bool found = XMLParser::next_token(PATTERN, PATTERN_LEN,
                                           PATTERN2, PATTERN_LEN2,
                                           PATTERN3, PATTERN_LEN3,
                                           t);
  return found;
}
// The class adds no data members of its own, so there is nothing to release.
HTMLParser::~HTMLParser() {
}

View File

@ -1,56 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef HTMLPARSER_HXX_
#define HTMLPARSER_HXX_
#include "xmlparser.hxx"
/*
* HTML Parser
*
*/
class HTMLParser : public XMLParser {
 public:
  // Narrow-character constructor; `wordchars` is passed on to XMLParser.
  explicit HTMLParser(const char* wordchars);
  // Wide-character constructor; both arguments are passed on to XMLParser.
  HTMLParser(const w_char* wordchars, int len);

  // Stores the next token in `token_out`; returns false when there is none.
  virtual bool next_token(std::string& token_out);

  virtual ~HTMLParser();
};
#endif

View File

@ -1,277 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "latexparser.hxx"
#ifndef W32
using namespace std;
#endif
// UTF-8 byte sequence of U+2019 (RIGHT SINGLE QUOTATION MARK), the
// typographic apostrophe.
#define UTF8_APOS "\xe2\x80\x99"
// Plain ASCII apostrophe.
#define APOSTROPHE "'"
// LaTeX constructs recognised by LaTeXParser::next_token().
//
// pat[0] is the text that opens the construct. If pat[1] is non-NULL the
// construct is a region that lasts until pat[1] is seen (arg is unused and
// 0). If pat[1] is NULL the construct is a command and arg is the number of
// closed top-level groups after which the parser returns to normal text.
//
// LaTeXParser::look_pattern() returns the FIRST row that matches and
// lower-cases the input before comparing, so row order matters (a row that
// is a prefix of a later row hides it) and every entry must be lower case.
static struct {
const char* pat[2];  // {opening text, closing text or NULL}
int arg;             // argument count for commands (pat[1] == NULL)
} PATTERN[] = {{{"\\(", "\\)"}, 0},
{{"$$", "$$"}, 0},
{{"$", "$"}, 0},
{{"\\begin{math}", "\\end{math}"}, 0},
{{"\\[", "\\]"}, 0},
{{"\\begin{displaymath}", "\\end{displaymath}"}, 0},
{{"\\begin{equation}", "\\end{equation}"}, 0},
{{"\\begin{equation*}", "\\end{equation*}"}, 0},
{{"\\cite", NULL}, 1},
{{"\\nocite", NULL}, 1},
{{"\\index", NULL}, 1},
{{"\\label", NULL}, 1},
{{"\\ref", NULL}, 1},
{{"\\pageref", NULL}, 1},
{{"\\autoref", NULL}, 1},
{{"\\parbox", NULL}, 1},
{{"\\begin{verbatim}", "\\end{verbatim}"}, 0},
{{"\\verb+", "+"}, 0},
{{"\\verb|", "|"}, 0},
{{"\\verb#", "#"}, 0},
{{"\\verb*", "*"}, 0},
// The whole preamble, up to \begin{document}, is treated as one region.
{{"\\documentstyle", "\\begin{document}"}, 0},
{{"\\documentclass", "\\begin{document}"}, 0},
// { { "\\documentclass", NULL } , 1 },
{{"\\usepackage", NULL}, 1},
{{"\\includeonly", NULL}, 1},
{{"\\include", NULL}, 1},
{{"\\input", NULL}, 1},
{{"\\vspace", NULL}, 1},
{{"\\setlength", NULL}, 2},
{{"\\addtolength", NULL}, 2},
{{"\\settowidth", NULL}, 2},
{{"\\rule", NULL}, 2},
{{"\\hspace", NULL}, 1},
// Duplicate of the "\\vspace" row above; never reached because the first
// match wins.
{{"\\vspace", NULL}, 1},
{{"\\\\[", "]"}, 0},
{{"\\pagebreak[", "]"}, 0},
{{"\\nopagebreak[", "]"}, 0},
{{"\\enlargethispage", NULL}, 1},
{{"\\begin{tabular}", NULL}, 1},
{{"\\addcontentsline", NULL}, 2},
{{"\\begin{thebibliography}", NULL}, 1},
{{"\\bibliography", NULL}, 1},
// Hidden by the "\\bibliography" prefix row above; both rows use arg 1.
{{"\\bibliographystyle", NULL}, 1},
{{"\\bibitem", NULL}, 1},
// Generic \begin / \end; the more specific \begin{...} rows come earlier.
{{"\\begin", NULL}, 1},
{{"\\end", NULL}, 1},
{{"\\pagestyle", NULL}, 1},
{{"\\pagenumbering", NULL}, 1},
{{"\\thispagestyle", NULL}, 1},
{{"\\newtheorem", NULL}, 2},
{{"\\newcommand", NULL}, 2},
{{"\\renewcommand", NULL}, 2},
{{"\\setcounter", NULL}, 2},
{{"\\addtocounter", NULL}, 1},
{{"\\stepcounter", NULL}, 1},
{{"\\selectlanguage", NULL}, 1},
{{"\\inputencoding", NULL}, 1},
{{"\\hyphenation", NULL}, 1},
{{"\\definecolor", NULL}, 3},
{{"\\color", NULL}, 1},
{{"\\textcolor", NULL}, 1},
{{"\\pagecolor", NULL}, 1},
{{"\\colorbox", NULL}, 2},
{{"\\fcolorbox", NULL}, 2},
{{"\\declaregraphicsextensions", NULL}, 1},
{{"\\psfig", NULL}, 1},
{{"\\url", NULL}, 1},
{{"\\eqref", NULL}, 1},
{{"\\vskip", NULL}, 1},
{{"\\vglue", NULL}, 1},
// Two consecutive ASCII apostrophes.
{{"\'\'", NULL}, 1}};
// Number of rows in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))
// Narrow-character constructor: `wordchars` goes to TextParser and all
// LaTeX-specific bookkeeping starts at zero.
LaTeXParser::LaTeXParser(const char* wordchars)
    : TextParser(wordchars),
      pattern_num(0),
      depth(0),
      arg(0),
      opt(0) {}

// Wide-character constructor: same zeroed bookkeeping, with `wordchars` and
// `len` passed on to TextParser.
LaTeXParser::LaTeXParser(const w_char* wordchars, int len)
    : TextParser(wordchars, len),
      pattern_num(0),
      depth(0),
      arg(0),
      opt(0) {}
// Only plain int members are added by this class; nothing to release.
LaTeXParser::~LaTeXParser() {
}
// Finds the first PATTERN row whose pat[col] text occurs at the current
// position (`head`) of the current line.
//
// col  0 compares the opening texts, 1 the closing texts; rows whose
//      pat[col] is NULL are skipped.
// Returns the row index, or -1 when no row matches. The input is lower-cased
// before comparison, so the table entries must be lower case.
int LaTeXParser::look_pattern(int col) {
  const char* const text = line[actual].c_str() + head;
  for (unsigned int i = 0; i < PATTERN_LEN; i++) {
    const char* k = PATTERN[i].pat[col];
    if (!k)
      continue;
    const char* j = text;
    // tolower() is only defined for EOF and unsigned char values; a plain
    // char holding a non-ASCII byte may be negative, hence the cast.
    // The terminating '\0' of the line never equals a pattern character, so
    // the loop cannot run past the end of the text.
    while ((*k != '\0') &&
           (tolower(static_cast<unsigned char>(*j)) == *k)) {
      j++;
      k++;
    }
    if (*k == '\0')
      return static_cast<int>(i);
  }
  return -1;
}
/*
 * LaTeXParser::next_token
 *
 * Scans the current line from `head` and stores the next word in `t`
 * (`t` is emptied first).
 *
 * Returns true as soon as alloc_token() accepts a word. Returns false once
 * next_char() yields non-zero (presumably end of line -- confirm in
 * TextParser). `state` and `head` are not locals, so a later call carries on
 * from where this one stopped.
 *
 * state 0: outside a word
 * state 1: inside a word
 * state 2: inside a region that ends with PATTERN[pattern_num].pat[1]
 *          (math, verbatim, preamble, ...)
 * state 3: inside a command name (letters following a backslash)
 * state 4: inside the arguments of a command whose pat[1] is NULL
 * state 5: after '%': nothing is examined for the rest of the scan
 */
bool LaTeXParser::next_token(std::string& t) {
  t.clear();
  int i;
  int slash = 0;  // state 4 only: previous character was a backslash
  int apostrophe;
  for (;;) {
    switch (state) {
      case 0:  // non word chars
        if ((pattern_num = look_pattern(0)) != -1) {
          if (PATTERN[pattern_num].pat[1]) {
            state = 2;
          } else {
            state = 4;
            depth = 0;
            arg = 0;
            opt = 1;
          }
          // Land on the last character of the opening text; next_char()
          // below moves past it.
          head += strlen(PATTERN[pattern_num].pat[0]) - 1;
        } else if (line[actual][head] == '%') {
          state = 5;
        } else if (is_wordchar(line[actual].c_str() + head)) {
          state = 1;
          token = head;
        } else if (line[actual][head] == '\\') {
          if (line[actual][head + 1] == '\\' ||   // \\ (linebreak)
              (line[actual][head + 1] == '$') ||  // \$ (dollar sign)
              (line[actual][head + 1] == '%')) {  // \% (percent)
            head++;
            break;
          }
          state = 3;
        }
        break;
      case 1:  // wordchar
        apostrophe = 0;
        if ((is_wordchar((char*)APOSTROPHE) ||
             (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
            !line[actual].empty() && line[actual][head] == '\'' &&
            is_wordchar(line[actual].c_str() + head + 1)) {
          // ASCII apostrophe inside a word: keep it in the token.
          head++;
        } else if (is_utf8() &&
                   is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
                                                      // to the WORDCHARS, if
                                                      // needed
                   strncmp(line[actual].c_str() + head, UTF8_APOS,
                           strlen(UTF8_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(UTF8_APOS))) {
          head += strlen(UTF8_APOS) - 1;
        } else if (!is_wordchar(line[actual].c_str() + head) ||
                   (line[actual][head] == '\'' &&
                    line[actual][head + 1] == '\'' && ++apostrophe)) {
          // End of the word; two apostrophes in a row also end it (they are
          // a LaTeX closing quote, not part of the word).
          state = 0;
          bool ok = alloc_token(token, &head, t);
          if (apostrophe)
            head += 2;
          if (ok)
            return true;
        }
        break;
      case 2:  // comment, labels, etc
        // Leave the region only on the closing text that belongs to the
        // opening text which started it.
        if (((i = look_pattern(1)) != -1) &&
            (strcmp(PATTERN[i].pat[1], PATTERN[pattern_num].pat[1]) == 0)) {
          state = 0;
          head += strlen(PATTERN[pattern_num].pat[1]) - 1;
        }
        break;
      case 3:  // command
        // The command name ends at the first non-letter. tolower() is only
        // defined for EOF and unsigned char values, hence the cast.
        if ((tolower(static_cast<unsigned char>(line[actual][head])) < 'a') ||
            (tolower(static_cast<unsigned char>(line[actual][head])) > 'z')) {
          state = 0;
          // Step back so this character is looked at again in state 0 once
          // next_char() has advanced.
          head--;
        }
        break;
      case 4:  // command with arguments
        if (slash && (line[actual][head] != '\0')) {
          // Character after a backslash: skipped without interpretation.
          slash = 0;
          head++;
          break;
        } else if (line[actual][head] == '\\') {
          slash = 1;
        } else if ((line[actual][head] == '{') ||
                   ((opt) && (line[actual][head] == '['))) {
          depth++;
          opt = 0;
        } else if (line[actual][head] == '}') {
          depth--;
          if (depth == 0) {
            opt = 1;
            arg++;
          }
          if (((depth == 0) && (arg == PATTERN[pattern_num].arg)) ||
              (depth < 0)) {
            state = 0;  // XXX not handles the last optional arg.
          }
        } else if (line[actual][head] == ']')
          depth--;
    }  // case
    if (next_char(line[actual].c_str(), &head)) {
      // A '%' comment does not carry over into the next scan.
      if (state == 5)
        state = 0;
      return false;
    }
  }
}

View File

@ -1,65 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef LATEXPARSER_HXX_
#define LATEXPARSER_HXX_
#include "textparser.hxx"
/*
* LaTeX Parser
*
*/
class LaTeXParser : public TextParser {
 private:
  int pattern_num;  // index of the pattern-table row currently in effect
  int depth;        // nesting level of {...} / [...] groups in arguments
  int arg;          // top-level argument groups closed so far
  int opt;          // non-zero while '[' may open an optional argument

 public:
  // Narrow-character constructor; `wordchars` is passed on to TextParser.
  explicit LaTeXParser(const char* wordchars);
  // Wide-character constructor; both arguments are passed on to TextParser.
  LaTeXParser(const w_char* wordchars, int len);
  virtual ~LaTeXParser();

  // Stores the next word in `token_out`; returns false when there is none.
  virtual bool next_token(std::string& token_out);

 private:
  // Returns the index of the first pattern-table row matching at the current
  // position, or -1; `col` picks opening (0) or closing (1) texts.
  int look_pattern(int col);
};
#endif

View File

@ -1,98 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "manparser.hxx"
#ifndef W32
using namespace std;
#endif
// Narrow-character constructor: `wordchars` goes straight to TextParser.
ManParser::ManParser(const char* wordchars) : TextParser(wordchars) {}

// Wide-character constructor: `wordchars` and `len` go straight to
// TextParser.
ManParser::ManParser(const w_char* wordchars, int len)
    : TextParser(wordchars, len) {}
// The class adds no data members of its own, so there is nothing to release.
ManParser::~ManParser() {
}
// Scans the current line of man-page source and stores the next word in `t`.
//
// `t` is emptied on entry, as in FirstParser::next_token and
// LaTeXParser::next_token, so a false return never leaves stale content in
// it. Returns true as soon as alloc_token() accepts a word, and false once
// next_char() yields non-zero (presumably end of line -- confirm in
// TextParser); the state is then reset for the next line.
//
// States:
//   0  start of line: a leading '.' introduces a request
//   1  inside the request name, up to the first space
//   2  outside a word
//   3  inside a word
bool ManParser::next_token(std::string& t) {
  t.clear();
  for (;;) {
    switch (state) {
      case 1:  // command arguments
        if (line[actual][head] == ' ')
          state = 2;
        break;
      case 0:  // dot in begin of line
        if (line[actual][0] == '.') {
          state = 1;
          break;
        } else {
          state = 2;
        }
      /* FALLTHROUGH */
      case 2:  // non word chars
        if (is_wordchar(line[actual].c_str() + head)) {
          state = 3;
          token = head;
        } else if ((line[actual][head] == '\\') &&
                   (line[actual][head + 1] == 'f') &&
                   (line[actual][head + 2] != '\0')) {
          // "\f" plus one more character (e.g. \fB, \fI): jump to the third
          // character so it cannot start a word; next_char() moves past it.
          head += 2;
        }
        break;
      case 3:  // wordchar
        if (!is_wordchar(line[actual].c_str() + head)) {
          state = 2;
          if (alloc_token(token, &head, t))
            return true;
        }
        break;
    }
    if (next_char(line[actual].c_str(), &head)) {
      state = 0;
      return false;
    }
  }
}

View File

@ -1,58 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef MANPARSER_HXX_
#define MANPARSER_HXX_
#include "textparser.hxx"
/*
* Man page Parser
*
*/
class ManParser : public TextParser {
 public:
  // Narrow-character constructor; `wordchars` is passed on to TextParser.
  explicit ManParser(const char* wordchars);
  // Wide-character constructor; both arguments are passed on to TextParser.
  ManParser(const w_char* wordchars, int len);
  virtual ~ManParser();

  // Stores the next word in `token_out`; returns false when there is none.
  virtual bool next_token(std::string& token_out);
};
#endif

View File

@ -1,86 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "odfparser.hxx"
#ifndef W32
using namespace std;
#endif
// Pattern tables that ODFParser passes to XMLParser::next_token() and
// XMLParser::get_word2(). Every row is an {opening text, closing text} pair
// written in lower case.
// NOTE(review): the matching rules live in XMLParser, outside this file; the
// remarks below assume it compares case-insensitively and uses the first row
// that fits -- confirm there.

// Regions running from the opening text to the closing text. Specific
// elements are listed before the catch-all {"<", ">"} row.
static const char* PATTERN[][2] = {
    {"<office:meta>", "</office:meta>"},
    {"<office:settings>", "</office:settings>"},
    {"<office:binary-data>", "</office:binary-data>"},
    {"<!--", "-->"},  // comment
    // CDATA section. The real opener is "<![CDATA[": without the '!' this
    // row could never match and the section fell through to {"<", ">"},
    // which stops at the first '>' inside the character data.
    {"<![cdata[", "]]>"},
    {"<", ">"}};      // any other tag
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))

// ODF supplies no second table.
static const char* (*PATTERN2)[2] = NULL;
#define PATTERN_LEN2 0

// Span tags that can occur in the middle of a word.
static const char* PATTERN3[][2] = {
    {"<text:span", ">"},    // part of the reedited words
    {"</text:span", ">"}};  // for example, an inserted letter
#define PATTERN_LEN3 (sizeof(PATTERN3) / sizeof(PATTERN3[0]))
// Narrow-character constructor: `wordchars` goes straight to XMLParser.
ODFParser::ODFParser(const char* wordchars) : XMLParser(wordchars) {}

// Wide-character constructor: `wordchars` and `len` go straight to XMLParser.
ODFParser::ODFParser(const w_char* wordchars, int len)
    : XMLParser(wordchars, len) {}
// Runs the generic XML tokenizer with the ODF pattern tables of this file
// and passes its result through; the token is written to `t`.
bool ODFParser::next_token(std::string& t) {
  const bool found = XMLParser::next_token(PATTERN, PATTERN_LEN,
                                           PATTERN2, PATTERN_LEN2,
                                           PATTERN3, PATTERN_LEN3,
                                           t);
  return found;
}
// Returns whatever XMLParser::get_word2() produces for `tok` when given the
// span-tag table (PATTERN3) of this file.
std::string ODFParser::get_word(const std::string &tok) {
  std::string word = XMLParser::get_word2(PATTERN3, PATTERN_LEN3, tok);
  return word;
}
// The class adds no data members of its own, so there is nothing to release.
ODFParser::~ODFParser() {
}

View File

@ -1,57 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef ODFPARSER_HXX_
#define ODFPARSER_HXX_
#include "xmlparser.hxx"
/*
* ODF Parser
*
*/
class ODFParser : public XMLParser {
 public:
  // Narrow-character constructor; `wordchars` is passed on to XMLParser.
  explicit ODFParser(const char* wordchars);
  // Wide-character constructor; both arguments are passed on to XMLParser.
  ODFParser(const w_char* wordchars, int len);

  // Stores the next token in `token_out`; returns false when there is none.
  virtual bool next_token(std::string& token_out);
  // Returns the word obtained from token `tok` via XMLParser::get_word2().
  virtual std::string get_word(const std::string &tok);

  virtual ~ODFParser();
};
#endif

View File

@ -1,86 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include "textparser.hxx"
#include "htmlparser.hxx"
#include "latexparser.hxx"
#include "xmlparser.hxx"
#ifndef W32
using namespace std;
#endif
int main(int argc, char** argv) {
FILE* f;
/* first parse the command line options */
if (argc < 2) {
fprintf(stderr, "correct syntax is:\n");
fprintf(stderr, "testparser file\n");
fprintf(stderr, "example: testparser /dev/stdin\n");
exit(1);
}
/* open the words to check list */
f = fopen(argv[1], "r");
if (!f) {
fprintf(stderr, "Error - could not open file of words to check\n");
exit(1);
}
TextParser* p = new TextParser(
"qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM");
char buf[MAXLNLEN];
while (fgets(buf, MAXLNLEN, f)) {
p->put_line(buf);
p->set_url_checking(1);
std::string next;
while (p->next_token(next)) {
fprintf(stdout, "token: %s\n", next.c_str());
}
}
delete p;
fclose(f);
return 0;
}

View File

@ -1,302 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "textparser.hxx"
#include <algorithm>
#ifndef W32
using namespace std;
#endif
// ISO-8859-1 HTML character entities
// get_latin1() compares the start of a string against each entry of this
// table; the tokenizers treat a recognised entity as part of a word.
static const char* LATIN1[] = {
"&Agrave;", "&Atilde;", "&Aring;", "&AElig;", "&Egrave;", "&Ecirc;",
"&Igrave;", "&Iuml;", "&ETH;", "&Ntilde;", "&Ograve;", "&Oslash;",
"&Ugrave;", "&THORN;", "&agrave;", "&atilde;", "&aring;", "&aelig;",
"&egrave;", "&ecirc;", "&igrave;", "&iuml;", "&eth;", "&ntilde;",
"&ograve;", "&oslash;", "&ugrave;", "&thorn;", "&yuml;"};
// Number of entries in LATIN1.
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char*))
// Apostrophe spellings: the "&apos;" entity, the UTF-8 byte sequence of the
// Unicode apostrophe (U+2019), and the plain ASCII apostrophe.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
// Builds a single-byte (non UTF-8) parser; all set-up is delegated to init().
TextParser::TextParser(const char* wordchars)
{
  this->init(wordchars);
}
// Builds a UTF-8 parser from a table of `len` UTF-16 word characters; all
// set-up is delegated to init().
TextParser::TextParser(const w_char* wordchars, int len)
{
  this->init(wordchars, len);
}
// Nothing to release explicitly: every member cleans up after itself.
TextParser::~TextParser()
{
}
int TextParser::is_wordchar(const char* w) {
if (*w == '\0')
return 0;
if (utf8) {
std::vector<w_char> wc;
unsigned short idx;
u8_u16(wc, w);
if (wc.empty())
return 0;
idx = (wc[0].h << 8) + wc[0].l;
return (unicodeisalpha(idx) ||
(wordchars_utf16 &&
std::binary_search(wordchars_utf16, wordchars_utf16 + wclen, wc[0])));
} else {
return wordcharacters[(*w + 256) % 256];
}
}
// Returns the LATIN1 table entry that `s` starts with, or NULL when `s` does
// not start with '&' or matches no entry.
const char* TextParser::get_latin1(const char* s) {
  if (s[0] != '&')
    return NULL;

  for (unsigned int k = 0; k < LATIN1_LEN; ++k) {
    const char* entity = LATIN1[k];
    if (strncmp(entity, s, strlen(entity)) == 0)
      return entity;
  }
  return NULL;
}
// Resets the parser for single-byte input and flags every byte of
// `wordchars` as a word character. A null `wordchars` selects the ASCII
// letters.
void TextParser::init(const char* wordchars) {
  // cursor / automaton state
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
  checkurl = 0;

  // single-byte mode: no UTF-16 table
  utf8 = 0;
  wordchars_utf16 = NULL;
  wclen = 0;

  wordcharacters.resize(256, 0);
  const char* letters =
      wordchars ? wordchars
                : "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
  for (const char* c = letters; *c != '\0'; ++c) {
    // (+256) % 256 maps a possibly negative char onto the index range 0..255.
    wordcharacters[(*c + 256) % 256] = 1;
  }
}
// Resets the parser for UTF-8 input. `wc` / `len` describe the table of
// additional word characters; the pointer is stored, not copied.
void TextParser::init(const w_char* wc, int len) {
  // UTF-8 mode with the caller's word character table
  utf8 = 1;
  wordchars_utf16 = wc;
  wclen = len;

  // cursor / automaton state
  actual = 0;
  head = 0;
  token = 0;
  state = 0;
  checkurl = 0;
}
// Advances `*pos` to the next character of `ln`.
// Returns 1 (leaving `*pos` untouched) when `*pos` is already on the string
// terminator, 0 otherwise. In UTF-8 mode the continuation bytes of a
// multi-byte character are skipped as well.
int TextParser::next_char(const char* ln, size_t* pos) {
  const char* cur = ln + *pos;
  if (*cur == '\0')
    return 1;

  ++(*pos);
  if (utf8 && (*cur >> 7)) {
    // jump over the continuation bytes (10xxxxxx) of this character
    while ((*(ln + *pos) & 0xc0) == 0x80)
      ++(*pos);
  }
  return 0;
}
// Stores `word` as the new current line in the next slot of the line ring,
// rewinds the token cursor and recomputes the URL mask for that line.
void TextParser::put_line(const char* word) {
  const int slot = (actual + 1) % MAXPREVLINE;
  actual = slot;
  line[slot].assign(word);
  head = 0;
  token = 0;
  check_urls();
}
// Returns the line stored `n` slots before the current one (n == 0 is the
// current line). The lines live in a ring of MAXPREVLINE entries, so `n` is
// reduced modulo MAXPREVLINE. Unlike the plain
// (actual + MAXPREVLINE - n) % MAXPREVLINE, this never produces a negative
// index, whatever value `n` has.
std::string TextParser::get_prevline(int n) const {
  // Reduce first so that the subtraction below cannot overflow.
  const int back = n % MAXPREVLINE;
  int slot = (actual - back) % MAXPREVLINE;
  // The remainder of a negative value is negative: wrap it into the ring.
  if (slot < 0)
    slot += MAXPREVLINE;
  return line[slot];
}
// Returns a copy of the line most recently passed to put_line().
std::string TextParser::get_line() const {
  const int current_line_offset = 0;
  return get_prevline(current_line_offset);
}
// Extracts the next word of the current line into `t`.
// Returns true when a token was stored, false when the end of the line was
// reached first. The scan position (`head`) and the automaton state are kept
// in the object, so successive calls walk through the line.
//
// state 0: between words, state 1: inside a word that began at `token`.
bool TextParser::next_token(std::string &t) {
  const char* latin1;
  for (;;) {
    switch (state) {
      case 0:  // non word chars
        if (is_wordchar(line[actual].c_str() + head)) {
          state = 1;
          token = head;
        } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          state = 1;
          token = head;
          // Stop on the last character of the entity: next_char() below moves
          // on to the character that follows it, which must still be
          // examined. Adding the full length would skip that character.
          head += strlen(latin1) - 1;
        }
        break;
      case 1:  // wordchar
        if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          // An entity inside a word; same positioning as in state 0.
          head += strlen(latin1) - 1;
        } else if ((is_wordchar((char*)APOSTROPHE) ||
                    (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
                   !line[actual].empty() && line[actual][head] == '\'' &&
                   is_wordchar(line[actual].c_str() + head + 1)) {
          // ASCII apostrophe followed by a word character: keep it in the
          // word.
          head++;
        } else if (is_utf8() &&
                   is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
                                                      // to the WORDCHARS, if
                                                      // needed
                   strncmp(line[actual].c_str() + head, UTF8_APOS,
                           strlen(UTF8_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(UTF8_APOS))) {
          head += strlen(UTF8_APOS) - 1;
        } else if (!is_wordchar(line[actual].c_str() + head)) {
          // End of the word; alloc_token() may still reject it (URL, or
          // nothing left after stripping a trailing colon).
          state = 0;
          if (alloc_token(token, &head, t))
            return true;
        }
        break;
    }
    if (next_char(line[actual].c_str(), &head))
      return false;
  }
}
// Returns the offset in the current line at which the last token began.
size_t TextParser::get_tokenpos() {
  const size_t token_begin = token;
  return token_begin;
}
// Replaces the text between `token` and `head` of the current line with
// `word` and moves `head` back to the start of the token.
// Returns 1 on success and 0 (line untouched) when `word` is null.
int TextParser::change_token(const char* word) {
  if (!word)
    return 0;

  std::string& current = line[actual];
  const std::string tail = current.substr(head);  // text after the token
  current.resize(token);                          // text before the token
  current += word;
  current += tail;
  head = token;
  return 1;
}
// Base implementation: the word is the token itself, returned as a copy.
std::string TextParser::get_word(const std::string &tok) {
  std::string word(tok);
  return word;
}
// Rebuilds `urlline`, the per-position mask of the current line: positions
// that belong to a character run recognised as a URL, e-mail address or file
// path are set to true. Uses its own two-state automaton (url_state), which
// is independent of the tokenizer state.
void TextParser::check_urls() {
// one extra slot so that the position of the string terminator is valid too
urlline.resize(line[actual].size() + 1);
int url_state = 0;
size_t url_head = 0;
size_t url_token = 0;
// set to 1 once the current run has been recognised as URL-like
int url = 0;
for (;;) {
switch (url_state) {
case 0: // non word chars
if (is_wordchar(line[actual].c_str() + url_head)) {
url_state = 1;
url_token = url_head;
// Unix path
} else if (line[actual][url_head] == '/') {
url_state = 1;
url_token = url_head;
url = 1;
}
break;
case 1: // wordchar
char ch = line[actual][url_head];
// e-mail address
if ((ch == '@') ||
// MS-DOS, Windows path
(strncmp(line[actual].c_str() + url_head, ":\\", 2) == 0) ||
// URL
(strncmp(line[actual].c_str() + url_head, "://", 3) == 0)) {
url = 1;
// any character outside the set below ends the run
} else if (!(is_wordchar(line[actual].c_str() + url_head) || (ch == '-') ||
(ch == '_') || (ch == '\\') || (ch == '.') ||
(ch == ':') || (ch == '/') || (ch == '~') || (ch == '%') ||
(ch == '*') || (ch == '$') || (ch == '[') || (ch == ']') ||
(ch == '?') || (ch == '!') ||
((ch >= '0') && (ch <= '9')))) {
url_state = 0;
if (url == 1) {
// mark the whole run [url_token, url_head)
for (size_t i = url_token; i < url_head; ++i) {
urlline[i] = true;
}
}
url = 0;
}
break;
}
// the position just examined starts out unmarked; it is only marked later,
// by the loop above, when the run it belongs to ends as URL-like
urlline[url_head] = false;
// next_char() returns non-zero at the string terminator: scan finished
if (next_char(line[actual].c_str(), &url_head))
return;
}
}
// Advances `*hd` over the positions marked in `urlline`, starting at `*hd`,
// then reports whether the token at `token_pos` is to be skipped as a URL:
// always 0 when URL checking is enabled, otherwise the mask value at
// `token_pos`.
int TextParser::get_url(size_t token_pos, size_t* hd) {
  const size_t line_len = line[actual].size();
  size_t i = *hd;
  while (i < line_len && urlline[i]) {
    ++i;
    ++(*hd);
  }

  if (checkurl)
    return 0;
  return urlline[token_pos];
}
void TextParser::set_url_checking(int check) {
checkurl = check;
}
// Copies the token [tokn, *hd) of the current line into `t`.
// Returns false when get_url() flags the token as a URL (`t` is left
// untouched) or when nothing remains after dropping one trailing colon;
// true otherwise.
bool TextParser::alloc_token(size_t tokn, size_t* hd, std::string& t) {
  // get_url() advances the position it is given; work on a copy so that the
  // caller's head stays where it is.
  size_t scratch_head = *hd;
  if (get_url(tokn, &scratch_head))
    return false;

  t = line[actual].substr(tokn, *hd - tokn);

  const bool ends_with_colon = !t.empty() && t[t.size() - 1] == ':';
  if (!ends_with_colon)
    return true;

  // remove colon for Finnish and Swedish language
  t.resize(t.size() - 1);
  return !t.empty();
}

View File

@ -1,99 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef TEXTPARSER_HXX_
#define TEXTPARSER_HXX_
// set sum of actual and previous lines
#define MAXPREVLINE 4
#ifndef MAXLNLEN
#define MAXLNLEN 8192
#endif
#include "../hunspell/w_char.hxx"
#include <vector>
/*
* Base Text Parser
*
*/
// Line-oriented tokenizer: a line is handed over with put_line() and then
// split into words by repeated calls to next_token(). The current line and
// the preceding ones are kept in a ring of MAXPREVLINE entries.
class TextParser {
protected:
std::vector<int> wordcharacters;// for detection of the word boundaries
std::string line[MAXPREVLINE]; // parsed and previous lines
std::vector<bool> urlline; // mask for url detection
// non-zero: get_url() never reports a token as a URL
int checkurl;
int actual; // actual line
size_t head; // head position
size_t token;// begin of token
int state; // state of automata
int utf8; // UTF-8 character encoding
// moves *pos to the next character of `line`; returns 1 at the terminator
int next_char(const char* line, size_t* pos);
// additional word characters for UTF-8 mode (pointer is not owned) and
// the number of entries
const w_char* wordchars_utf16;
int wclen;
public:
// UTF-8 parser with a table of `len` extra word characters
TextParser(const w_char* wordchars, int len);
// single-byte parser; `wc` lists the word characters
explicit TextParser(const char* wc);
virtual ~TextParser();
// makes `line` the current line and rewinds the token cursor
void put_line(const char* line);
std::string get_line() const;
// line stored `n` slots before the current one
std::string get_prevline(int n) const;
// stores the next token; returns false at the end of the line
virtual bool next_token(std::string&);
virtual std::string get_word(const std::string &tok);
// replaces the last token with `word`; returns 0 when `word` is null
virtual int change_token(const char* word);
void set_url_checking(int check);
size_t get_tokenpos();
int is_wordchar(const char* w);
inline int is_utf8() { return utf8; }
const char* get_latin1(const char* s);
// NOTE(review): no definition of the next two members is visible in
// textparser.cxx - confirm whether they are still needed.
char* next_char();
int tokenize_urls();
void check_urls();
int get_url(size_t token_pos, size_t* head);
bool alloc_token(size_t token, size_t* head, std::string& out);
private:
void init(const char*);
void init(const w_char* wordchars, int len);
};
#endif

View File

@ -1,253 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include "../hunspell/csutil.hxx"
#include "xmlparser.hxx"
#ifndef W32
using namespace std;
#endif
// States of the automaton in XMLParser::next_token().
enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };

// Regions to skip, as {opening, closing} pairs. look_pattern() lower-cases
// the input before comparing, so the patterns are written in lower case.
// The CDATA opener is "<![CDATA["; without the '!' it could never match and
// such sections were handled as ordinary tags ending at the first '>'.
static const char* __PATTERN__[][2] = {{"<!--", "-->"},       // XML comment
                                       {"<![cdata[", "]]>"},  // CDATA section
                                       {"<", ">"}};           // any other tag
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))

// for checking attributes, eg. <img alt="text"> in HTML
static const char* (*__PATTERN2__)[2] = NULL;
#define __PATTERN_LEN2__ 0

// for checking words with in-word patterns
// for example, "exam<text:span>p</text:span>le" in ODT
static const char* (*__PATTERN3__)[2] = NULL;
#define __PATTERN_LEN3__ 0

// Apostrophe spellings: the "&apos;" entity, the UTF-8 byte sequence of the
// Unicode apostrophe (U+2019), and the plain ASCII apostrophe.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"
#define APOSTROPHE "'"
// Single-byte XML parser: the base class receives the word characters, all
// pattern indices and attribute-tracking members start at zero.
XMLParser::XMLParser(const char* wordchars)
    : TextParser(wordchars),
      pattern_num(0),
      pattern2_num(0),
      pattern3_num(0),
      prevstate(0),
      checkattr(0),
      quotmark(0)
{
}
// UTF-8 XML parser: the base class receives the UTF-16 word character table,
// all pattern indices and attribute-tracking members start at zero.
XMLParser::XMLParser(const w_char* wordchars, int len)
    : TextParser(wordchars, len),
      pattern_num(0),
      pattern2_num(0),
      pattern3_num(0),
      prevstate(0),
      checkattr(0),
      quotmark(0)
{
}
// Nothing to release explicitly.
XMLParser::~XMLParser()
{
}
// Checks whether the text at the current head position starts with one of
// the patterns p[0..len)[column]; the input is lower-cased for the
// comparison. Returns the index of the first matching pattern or -1.
int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
  const char* const text = line[actual].c_str() + head;
  for (unsigned int i = 0; i < len; i++) {
    const char* j = text;
    const char* k = p[i][column];
    // tolower() is only defined for unsigned char values (and EOF); plain
    // char may be negative for non-ASCII bytes, so convert first and narrow
    // the result back before comparing with the pattern byte.
    while ((*k != '\0') &&
           (static_cast<char>(tolower(static_cast<unsigned char>(*j))) == *k)) {
      j++;
      k++;
    }
    if (*k == '\0')
      return i;
  }
  return -1;
}
/*
* XML parser
*
*/
// Extracts the next word of the current line into `t`, skipping markup.
//   PATTERN  - {open, close} pairs of regions to skip (comments, tags, ...)
//   PATTERN2 - {open, close} pairs selecting attributes whose quoted value
//              is tokenized as well
//   PATTERN3 - {open, close} pairs of markup allowed inside a word
// Returns true when a token was stored and false at the end of the line. The
// automaton state is kept in the object between calls.
bool XMLParser::next_token(const char* PATTERN[][2],
                           unsigned int PATTERN_LEN,
                           const char* PATTERN2[][2],
                           unsigned int PATTERN_LEN2,
                           const char* PATTERN3[][2],
                           unsigned int PATTERN_LEN3,
                           std::string& t) {
  t.clear();
  const char* latin1;

  for (;;) {
    switch (state) {
      case ST_NON_WORD:  // non word chars
        prevstate = ST_NON_WORD;
        if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
          checkattr = 0;
          if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
            checkattr = 1;
          }
          state = ST_TAG;
        } else if (is_wordchar(line[actual].c_str() + head)) {
          state = ST_WORD;
          token = head;
        } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          state = ST_WORD;
          token = head;
          // Stop on the last character of the entity: next_char() below moves
          // on to the character that follows it, which must still be
          // examined. Adding the full length would skip that character.
          head += strlen(latin1) - 1;
        } else if (line[actual][head] == '&') {
          state = ST_CHAR_ENTITY;
        }
        break;

      case ST_WORD:  // wordchar
        if ((latin1 = get_latin1(line[actual].c_str() + head))) {
          // An entity inside a word; same positioning as in ST_NON_WORD.
          head += strlen(latin1) - 1;
        } else if ((is_wordchar((char*)APOSTROPHE) ||
                    (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
                   strncmp(line[actual].c_str() + head, ENTITY_APOS,
                           strlen(ENTITY_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(ENTITY_APOS))) {
          head += strlen(ENTITY_APOS) - 1;
        } else if (is_utf8() &&
                   is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
                                                      // to the WORDCHARS, if
                                                      // needed
                   strncmp(line[actual].c_str() + head, UTF8_APOS,
                           strlen(UTF8_APOS)) == 0 &&
                   is_wordchar(line[actual].c_str() + head +
                               strlen(UTF8_APOS))) {
          head += strlen(UTF8_APOS) - 1;
        } else if (!is_wordchar(line[actual].c_str() + head)) {
          // in-word patterns
          if ((pattern3_num = look_pattern(PATTERN3, PATTERN_LEN3, 0)) != -1) {
            size_t pos = line[actual].find(PATTERN3[pattern3_num][1], head);
            if (pos != std::string::npos) {
              size_t endpos = pos + strlen(PATTERN3[pattern3_num][1]) - 1;
              if (is_wordchar(line[actual].c_str() + endpos + 1)) {
                // the word goes on behind the closing pattern
                head = endpos;
                break;
              }
            }
          }
          state = prevstate;
          // return with the token, except in the case of in-word patterns
          if (alloc_token(token, &head, t))
            return true;
        }
        break;

      case ST_TAG: {  // comment, labels, etc
        int i;
        if ((checkattr == 1) &&
            ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
            (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
          checkattr = 2;
        } else if ((checkattr > 0) && (line[actual][head] == '>')) {
          state = ST_NON_WORD;
        } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
                   (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
          state = ST_NON_WORD;
          head += strlen(PATTERN[pattern_num][1]) - 1;
        } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
                   ((line[actual][head] == '"') ||
                    (line[actual][head] == '\''))) {
          // remember which quote opened the attribute value
          quotmark = line[actual][head];
          state = ST_ATTRIB;
        }
        break;
      }

      case ST_ATTRIB:  // non word chars
        prevstate = ST_ATTRIB;
        if (line[actual][head] == quotmark) {
          state = ST_TAG;
          if (checkattr == 2)
            checkattr = 1;
          // for IMG ALT
        } else if (is_wordchar(line[actual].c_str() + head) &&
                   (checkattr == 2)) {
          state = ST_WORD;
          token = head;
        } else if (line[actual][head] == '&') {
          state = ST_CHAR_ENTITY;
        }
        break;

      case ST_CHAR_ENTITY:  // SGML element
        // Plain comparison: ';' has no lower-case form, and tolower() must
        // not be called with a (possibly negative) plain char.
        if (line[actual][head] == ';') {
          state = prevstate;
          // step back so that next_char() below lands on the ';' again and
          // the restored state examines it
          head--;
        }
        break;
    }
    if (next_char(line[actual].c_str(), &head))
      return false;
  }
  // not reached: the loop above is only left through a return statement
}
// Tokenizes with this file's default pattern tables: the skip patterns in
// __PATTERN__ and empty attribute and in-word pattern tables.
bool XMLParser::next_token(std::string& t) {
  const bool found = next_token(__PATTERN__, __PATTERN_LEN__,
                                __PATTERN2__, __PATTERN_LEN2__,
                                __PATTERN3__, __PATTERN_LEN3__,
                                t);
  return found;
}
// remove in-word patterns
// Returns a copy of `tok` with every in-word pattern removed: for each
// {open, close} pair, the text from an `open` up to and including the next
// `close` is erased. If an `open` has no matching `close`, the word is
// returned as it stands at that point.
std::string XMLParser::get_word2(
    const char* PATTERN3[][2],
    unsigned int PATTERN_LEN3,
    const std::string &tok) {
  std::string result(tok);
  for (unsigned int idx = 0; idx < PATTERN_LEN3; ++idx) {
    for (;;) {
      const size_t start = result.find(PATTERN3[idx][0]);
      if (start == result.npos)
        break;  // no further occurrence of this pattern

      const size_t stop = result.find(PATTERN3[idx][1], start);
      if (stop == result.npos)
        return result;  // unterminated pattern: give up

      result.erase(start, stop + strlen(PATTERN3[idx][1]) - start);
    }
  }
  return result;
}
// Replaces the last token with `word`, escaping the characters that have a
// special meaning in XML (& ' " < >) as entities first.
// Returns the result of TextParser::change_token(): 1 on success, 0 when
// `word` is null.
int XMLParser::change_token(const char* word) {
  // The base class accepts a null word and reports 0; do the same here
  // instead of handing a null pointer to strstr()/strchr().
  if (word == NULL)
    return TextParser::change_token(word);

  if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
      strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
      strchr(word, '>') != NULL) {
    std::string r(word);
    // '&' is replaced in two steps, via a placeholder that contains no '&'.
    mystrrep(r, "&", "__namp;__");
    mystrrep(r, "__namp;__", "&amp;");
    mystrrep(r, APOSTROPHE, ENTITY_APOS);
    mystrrep(r, "\"", "&quot;");
    mystrrep(r, ">", "&gt;");
    mystrrep(r, "<", "&lt;");
    return TextParser::change_token(r.c_str());
  }
  return TextParser::change_token(word);
}

View File

@ -1,76 +0,0 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* Copyright (C) 2002-2017 Németh László
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef XMLPARSER_HXX_
#define XMLPARSER_HXX_
#include "textparser.hxx"
/*
* XML Parser
*
*/
// Tokenizer for XML-like input, built on TextParser: markup matched by the
// pattern tables is skipped and only the words in between are returned.
class XMLParser : public TextParser {
public:
// single-byte parser; `wc` lists the word characters
explicit XMLParser(const char* wc);
// UTF-8 parser with a table of `len` extra word characters
XMLParser(const w_char* wordchars, int len);
// Tokenizes with caller-supplied tables of {open, close} pairs:
// p = regions to skip, p2 = attributes whose value is tokenized,
// p3 = markup allowed inside a word.
bool next_token(const char* p[][2],
unsigned int len,
const char* p2[][2],
unsigned int len2,
const char* p3[][2],
unsigned int len3,
std::string&);
// Tokenizes with the default tables defined in xmlparser.cxx.
virtual bool next_token(std::string&);
// Returns `tok` with the given {open, close} patterns removed
// (the definition uses it for the in-word patterns).
std::string get_word2(const char* p2[][2],
unsigned int len2,
const std::string &tok);
// Replaces the last token with `word` after escaping XML special characters.
int change_token(const char* word);
virtual ~XMLParser();
private:
// index of the first pattern in column `column` matching at head, or -1
int look_pattern(const char* p[][2], unsigned int len, int column);
// results of the last look_pattern() call for each of the three tables
int pattern_num;
int pattern2_num;
int pattern3_num;
// state to return to after a word or a character entity
int prevstate;
// attribute tracking: 0 = off, 1 = tag with checked attributes,
// 2 = inside a checked attribute
int checkattr;
// quote character that opened the current attribute value
char quotmark;
};
#endif

View File

@ -45,19 +45,4 @@ liblyxhunspell_a_SOURCES = \
1.7.0/src/hunspell/suggestmgr.hxx \
1.7.0/src/hunspell/utf_info.hxx \
1.7.0/src/hunspell/w_char.hxx \
1.7.0/src/parsers/firstparser.cxx \
1.7.0/src/parsers/firstparser.hxx \
1.7.0/src/parsers/htmlparser.cxx \
1.7.0/src/parsers/htmlparser.hxx \
1.7.0/src/parsers/latexparser.cxx \
1.7.0/src/parsers/latexparser.hxx \
1.7.0/src/parsers/manparser.cxx \
1.7.0/src/parsers/manparser.hxx \
1.7.0/src/parsers/odfparser.cxx \
1.7.0/src/parsers/odfparser.hxx \
1.7.0/src/parsers/testparser.cxx \
1.7.0/src/parsers/textparser.cxx \
1.7.0/src/parsers/textparser.hxx \
1.7.0/src/parsers/xmlparser.cxx \
1.7.0/src/parsers/xmlparser.hxx \
1.7.0/src/win_api/config.h