Rewrite statistics code

The statistics code is known to be very slow, because it relies on
DocIterator to go through the buffer.

This commit introduces a new Statistics class that encapsulates the
main code, along with a virtual method Inset::updateStatistics() that
allows fine-tuning of how counting is done inset by inset.

This is a faithful bug-for-bug reimplementation.

The new code appears to be 3x faster than the old one.

See bug #12929 for a discussion about statistics update woes.
This commit is contained in:
Jean-Marc Lasgouttes 2024-07-21 22:09:28 +02:00
parent 8c3d378217
commit 6e81f31722
20 changed files with 275 additions and 176 deletions

View File

@ -56,7 +56,7 @@
#include "PDFOptions.h"
#include "Session.h"
#include "SpellChecker.h"
#include "xml.h"
#include "Statistics.h"
#include "texstream.h"
#include "TexRow.h"
#include "Text.h"
@ -66,6 +66,7 @@
#include "VCBackend.h"
#include "version.h"
#include "WordLangTuple.h"
#include "xml.h"
#include "insets/InsetBranch.h"
#include "insets/InsetInclude.h"
@ -357,10 +358,8 @@ public:
///
mutable bool need_update;
private:
int word_count_;
int char_count_;
int blank_count_;
///
Statistics statistics_;
public:
/// This is here to force the test to be done whenever parent_buffer
@ -395,22 +394,6 @@ public:
parent_buffer->invalidateBibinfoCache();
}
/// compute statistics
/// \p from initial position
/// \p to points to the end position
void updateStatistics(DocIterator & from, DocIterator & to,
bool skipNoOutput = true);
/// statistics accessor functions
int wordCount() const
{
return word_count_;
}
int charCount(bool with_blanks) const
{
return char_count_
+ (with_blanks ? blank_count_ : 0);
}
// Make sure the file monitor monitors the good file.
void refreshFileMonitor();
@ -462,8 +445,7 @@ Buffer::Impl::Impl(Buffer * owner, FileName const & file, bool readonly_,
have_bibitems_(false), lyx_clean(true), bak_clean(true), unnamed(false),
internal_buffer(false), read_only(readonly_), file_fully_loaded(false),
need_format_backup(false), ignore_parent(false), macro_lock(false),
externally_modified_(false), bibinfo_cache_valid_(false),
need_update(false), word_count_(0), char_count_(0), blank_count_(0)
externally_modified_(false), bibinfo_cache_valid_(false), need_update(false)
{
refreshFileMonitor();
if (!cloned_buffer_) {
@ -5464,83 +5446,9 @@ void Buffer::requestSpellcheck()
}
void Buffer::Impl::updateStatistics(DocIterator & from, DocIterator & to, bool skipNoOutput)
Statistics & Buffer::statistics()
{
bool inword = false;
word_count_ = 0;
char_count_ = 0;
blank_count_ = 0;
for (DocIterator dit = from ; dit != to && !dit.atEnd(); ) {
if (!dit.inTexted()) {
dit.forwardPos();
continue;
}
Paragraph const & par = dit.paragraph();
pos_type const pos = dit.pos();
// Copied and adapted from isWordSeparator() in Paragraph
if (pos == dit.lastpos()) {
inword = false;
} else {
Inset const * ins = par.getInset(pos);
if (ins && skipNoOutput && !ins->producesOutput()) {
// skip this inset
++dit.top().pos();
// stop if end of range was skipped
if (!to.atEnd() && dit >= to)
break;
continue;
} else if (!par.isDeleted(pos)) {
if (par.isWordSeparator(pos))
inword = false;
else if (!inword) {
++word_count_;
inword = true;
}
if (ins && ins->isLetter()) {
odocstringstream os;
ins->toString(os);
char_count_ += os.str().length();
}
else if (ins && ins->isSpace())
++blank_count_;
else if (ins) {
pair<int, int> words = ins->isWords();
char_count_ += words.first;
word_count_ += words.second;
inword = false;
}
else {
char_type const c = par.getChar(pos);
if (isPrintableNonspace(c))
++char_count_;
else if (isSpace(c))
++blank_count_;
}
}
}
dit.forwardPos();
}
}
void Buffer::updateStatistics(DocIterator & from, DocIterator & to, bool skipNoOutput) const
{
d->updateStatistics(from, to, skipNoOutput);
}
int Buffer::wordCount() const
{
return d->wordCount();
}
int Buffer::charCount(bool with_blanks) const
{
return d->charCount(with_blanks);
return d->statistics_;
}

View File

@ -51,6 +51,7 @@ class otexstream;
class ParagraphList;
class ParIterator;
class ParConstIterator;
class Statistics;
class TeXErrors;
class TexRow;
class TocBackend;
@ -784,15 +785,8 @@ public:
std::string includedFilePath(std::string const & name,
std::string const & ext = empty_string()) const;
/// compute statistics between \p from and \p to
/// \p from initial position
/// \p to points to the end position
/// \p skipNoOutput if notes etc. should be ignored
void updateStatistics(DocIterator & from, DocIterator & to,
bool skipNoOutput = true) const;
/// statistics accessor functions
int wordCount() const;
int charCount(bool with_blanks) const;
/// Count of words, characters and blanks
Statistics & statistics();
///
bool areChangesPresent() const;

View File

@ -38,6 +38,7 @@
#include "MetricsInfo.h"
#include "Paragraph.h"
#include "Session.h"
#include "Statistics.h"
#include "texstream.h"
#include "Text.h"
#include "TextMetrics.h"
@ -78,6 +79,7 @@
#include "support/Lexer.h"
#include "support/lstrings.h"
#include "support/lyxlib.h"
#include "support/pmprof.h"
#include "support/types.h"
#include <algorithm>
@ -2006,18 +2008,11 @@ void BufferView::dispatch(FuncRequest const & cmd, DispatchResult & dr)
}
case LFUN_STATISTICS: {
DocIterator from, to;
if (cur.selection()) {
from = cur.selectionBegin();
to = cur.selectionEnd();
} else {
from = doc_iterator_begin(&buffer_);
to = doc_iterator_end(&buffer_);
}
buffer_.updateStatistics(from, to);
int const words = buffer_.wordCount();
int const chars = buffer_.charCount(false);
int const chars_blanks = buffer_.charCount(true);
Statistics & stats = buffer_.statistics();
stats.update(cur);
int const words = stats.word_count;
int const chars = stats.char_count;
int const chars_blanks = chars + stats.blank_count;
docstring message;
if (cur.selection())
message = _("Statistics for the selection:");
@ -2040,8 +2035,8 @@ void BufferView::dispatch(FuncRequest const & cmd, DispatchResult & dr)
message += _("One character (no blanks)");
Alert::information(_("Statistics"), message);
}
break;
}
case LFUN_STATISTICS_REFERENCE_CLAMP: {
d->stats_update_trigger_ = true;
@ -2050,14 +2045,11 @@ void BufferView::dispatch(FuncRequest const & cmd, DispatchResult & dr)
break;
}
DocIterator from, to;
from = doc_iterator_begin(&buffer_);
to = doc_iterator_end(&buffer_);
buffer_.updateStatistics(from, to);
d->stats_ref_value_w_ = buffer_.wordCount();
d->stats_ref_value_c_ = buffer_.charCount(true);
d->stats_ref_value_nb_ = buffer_.charCount(false);
// Recompute the counts and remember them as the new reference values.
// NOTE(review): update(cur) only counts the selection when one is active;
// confirm that the reference should not always cover the whole document.
Statistics & stats = buffer_.statistics();
stats.update(cur);
d->stats_ref_value_w_ = stats.word_count;
// characters including blanks
d->stats_ref_value_c_ = stats.char_count + stats.blank_count;
// characters without blanks
d->stats_ref_value_nb_ = stats.char_count;
break;
}

View File

@ -173,6 +173,7 @@ SOURCEFILESCORE = \
ServerSocket.cpp \
Session.cpp \
Spacing.cpp \
Statistics.cpp \
TexRow.cpp \
texstream.cpp \
Text.cpp \
@ -277,6 +278,7 @@ HEADERFILESCORE = \
Session.h \
Spacing.h \
SpellChecker.h \
Statistics.h \
TexRow.h \
texstream.h \
Text.h \

120
src/Statistics.cpp Normal file
View File

@ -0,0 +1,120 @@
// -*- C++ -*-
/**
* \file Statistics.cpp
* This file is part of LyX, the document processor.
* Licence details can be found in the file COPYING.
*
* \author Jean-Marc Lasgouttes
*
* Full author contact details are available in file CREDITS.
*/
#include <config.h>
#include "Statistics.h"
#include "Paragraph.h"
#include "Text.h"
#include "Cursor.h"
#include "support/lassert.h"
#include "support/lstrings.h"
#include "support/textutils.h"
namespace lyx {
using namespace support;
/** Recompute the counters from scratch for the cursor \c cur.
 *  With a selection, only the selected range is counted (nothing is
 *  counted when the selection is in math); without one, the whole text
 *  at the bottom of the cursor is counted.
 */
void Statistics::update(CursorData const & cur)
{
	// Reset the counters and the word-tracking state only, so that the
	// caller's skip_no_output setting survives the update.
	word_count = 0;
	char_count = 0;
	blank_count = 0;
	inword = false;

	if (!cur.selection()) {
		update(*cur.bottom().text());
		return;
	}
	// selections in math are not counted
	if (cur.inMathed())
		return;
	CursorSlice const from = cur.selBegin();
	CursorSlice to = cur.selEnd();
	update(from, to);
}
/// Add the contents of the string \c s to the counters.
void Statistics::update(docstring const & s)
{
	// FIXME: use a stripped-down version of the paragraph code.
	// This is the original code from InsetCitation::isWords()

	// every element of the string counts as a character
	auto const nchars = s.size();
	// FIXME: this does not count words properly
	auto const nwords = wordCount(s);

	char_count += nchars;
	word_count += nwords;
	// FIXME: spaces are not counted
}
/// Add every paragraph of \c text to the counters.
void Statistics::update(Text const & text)
{
	auto const & pars = text.paragraphs();
	for (auto const & p : pars) {
		update(p);
	}
}
/** Add the range delimited by \c from and \c to to the counters.
 *  When both ends are in the same cell, the paragraphs between the two
 *  positions are counted; otherwise every cell from from.idx() to
 *  to.idx() is counted in full.
 */
void Statistics::update(CursorSlice const & from, CursorSlice & to)
{
	// Both ends have to live in the same inset.
	// NOTE(review): text() is presumably per-cell, so requiring identical
	// texts here would reject the multi-cell case handled below -- confirm.
	LASSERT(&from.inset() == &to.inset(), return);

	if (from.idx() != to.idx()) {
		// several cells are selected: count each of them entirely
		for (idx_type idx = from.idx() ; idx <= to.idx(); ++idx)
			update(*from.inset().getText(idx));
		return;
	}

	if (from.pit() == to.pit()) {
		// everything happens inside one paragraph
		update(from.paragraph(), from.pos(), to.pos());
		return;
	}

	// The first paragraph is counted from from.pos() to its end, the
	// intermediate ones entirely, the last one up to to.pos().
	pos_type frompos = from.pos();
	for (pit_type pit = from.pit() ; pit < to.pit() ; ++pit) {
		update(from.text()->getPar(pit), frompos);
		frompos = 0;
	}
	update(to.paragraph(), 0, to.pos());
}
/** Add positions [from, to) of paragraph \c par to the counters.
 *  A value of -1 for \c to stands for the end of the paragraph.
 */
void Statistics::update(Paragraph const & par, pos_type from, pos_type to)
{
	pos_type const end = (to == -1) ? par.size() : to;

	for (pos_type i = from ; i < end ; ++i) {
		Inset const * const inset = par.isInset(i) ? par.getInset(i) : nullptr;

		// Deleted material is ignored, and so are insets without
		// output when skip_no_output is set.
		if (par.isDeleted(i)
		    || (inset && skip_no_output && !inset->producesOutput()))
			continue;

		// A new word starts at the first non-separator position
		// that follows a separator.
		bool const separator = par.isWordSeparator(i);
		if (!separator && !inword)
			++word_count;
		inword = !separator;

		// Insets decide themselves what they contribute
		if (inset) {
			inset->updateStatistics(*this);
			continue;
		}

		char_type const c = par.getChar(i);
		if (isPrintableNonspace(c))
			++char_count;
		else if (lyx::isSpace(c))
			++blank_count;
	}
	// a word never continues past the end of the range
	inword = false;
}
} // namespace lyx

64
src/Statistics.h Normal file
View File

@ -0,0 +1,64 @@
// -*- C++ -*-
/**
* \file Statistics.h
* This file is part of LyX, the document processor.
* Licence details can be found in the file COPYING.
*
* \author Jean-Marc Lasgouttes
*
* Full author contact details are available in file CREDITS.
*/
#ifndef STATISTICS_H
#define STATISTICS_H
#include "support/strfwd.h"
#include "support/types.h"
namespace lyx {
class CursorData;
class CursorSlice;
class Text;
class Paragraph;
// Class used to compute letters/words statistics on buffer or selection
/** Computes word, character and blank counts for a document or a
 *  selection. The counters are public members that callers read
 *  directly after calling update().
 */
class Statistics {
public:
/// Number of words
int word_count = 0;
/// Number of non-blank characters
int char_count = 0;
/// Number of blank characters
int blank_count = 0;
/// When true, insets that do not produce output are skipped.
bool skip_no_output = true;
/// Count characters in the whole document, or in the selection if
/// there is one. This is the main entry point; the counters are
/// reset before counting.
void update(CursorData const & cur);
/// Add the chars and words of this string to the counters.
/// Blanks are not counted separately.
void update(docstring const & s);
/// Add the chars and words of all the paragraphs of \c text
/// to the counters.
void update(Text const & text);
private:
/// Count chars and words between two positions
void update(CursorSlice const & from, CursorSlice & to);
/** Count chars and words in a paragraph
* \param par: the paragraph
* \param from: starting position
* \param to: end position (not included). If it is equal to -1,
* then the end is the end of the paragraph.
*/
void update(Paragraph const & par, pos_type from = 0, pos_type to = -1);
/// True while the position being scanned is inside a word; used to
/// detect where a new word starts.
bool inword = false;
};
}
#endif // STATISTICS_H

View File

@ -52,6 +52,7 @@
#include "Paragraph.h"
#include "ParagraphParameters.h"
#include "SpellChecker.h"
#include "Statistics.h"
#include "TextClass.h"
#include "TextMetrics.h"
#include "Thesaurus.h"
@ -6328,27 +6329,20 @@ void Text::dispatch(Cursor & cur, FuncRequest & cmd)
break;
case LFUN_SERVER_GET_STATISTICS: {
DocIterator from, to;
if (cur.selection()) {
from = cur.selectionBegin();
to = cur.selectionEnd();
} else {
from = doc_iterator_begin(cur.buffer());
to = doc_iterator_end(cur.buffer());
}
Statistics & stats = cur.buffer()->statistics();
stats.update(cur);
cur.buffer()->updateStatistics(from, to);
string const arg0 = cmd.getArg(0);
if (arg0 == "words") {
cur.message(convert<docstring>(cur.buffer()->wordCount()));
cur.message(convert<docstring>(stats.word_count));
} else if (arg0 == "chars") {
cur.message(convert<docstring>(cur.buffer()->charCount(false)));
cur.message(convert<docstring>(stats.char_count));
} else if (arg0 == "chars-space") {
cur.message(convert<docstring>(cur.buffer()->charCount(true)));
cur.message(convert<docstring>(stats.char_count + stats.blank_count));
} else {
cur.message(convert<docstring>(cur.buffer()->wordCount()) + " "
+ convert<docstring>(cur.buffer()->charCount(false)) + " "
+ convert<docstring>(cur.buffer()->charCount(true)));
cur.message(convert<docstring>(stats.word_count) + " "
+ convert<docstring>(stats.char_count) + " "
+ convert<docstring>(stats.char_count + stats.blank_count));
}
break;
}

View File

@ -60,8 +60,9 @@
#include "LyXRC.h"
#include "LyXVC.h"
#include "Paragraph.h"
#include "SpellChecker.h"
#include "Session.h"
#include "SpellChecker.h"
#include "Statistics.h"
#include "TexRow.h"
#include "Text.h"
#include "Toolbars.h"
@ -1478,31 +1479,21 @@ void GuiView::showStats()
if (d.time_to_update > 0)
return;
DocIterator from, to;
if (cur.selection()) {
from = cur.selectionBegin();
to = cur.selectionEnd();
d.already_in_selection_ = true;
} else {
from = doc_iterator_begin(buf);
to = doc_iterator_end(buf);
d.already_in_selection_ = false;
}
// Don't attempt to calculate stats if
// the buffer is busy as this might crash (#12935)
Statistics & statistics = buf->statistics();
if (!busy() && !bv->busy())
buf->updateStatistics(from, to);
statistics.update(cur);
QStringList stats;
if (word_count_enabled_) {
int const words = buf->wordCount() - bv->stats_ref_value_w();
int const words = statistics.word_count - bv->stats_ref_value_w();
if (words == 1)
stats << toqstr(bformat(_("%1$d Word"), words));
else
stats << toqstr(bformat(_("%1$d Words"), words));
}
int const chars_with_blanks = buf->charCount(true);
int const chars_with_blanks = statistics.char_count + statistics.blank_count;
if (char_count_enabled_) {
int const chars_with_blanks_disp = chars_with_blanks - bv->stats_ref_value_c();
if (chars_with_blanks == 1)
@ -1511,7 +1502,7 @@ void GuiView::showStats()
stats << toqstr(bformat(_("%1$d Characters"), chars_with_blanks_disp));
}
if (char_nb_count_enabled_) {
int const chars = buf->charCount(false) - bv->stats_ref_value_nb();
int const chars = statistics.char_count - bv->stats_ref_value_nb();
if (chars == 1)
stats << toqstr(bformat(_("%1$d Character (no Blanks)"), chars));
else

View File

@ -30,10 +30,11 @@
#include "InsetLayout.h"
#include "MetricsInfo.h"
#include "output_xhtml.h"
#include "xml.h"
#include "Statistics.h"
#include "Text.h"
#include "TextClass.h"
#include "TocBackend.h"
#include "xml.h"
#include "frontends/Application.h"
#include "frontends/Painter.h"
@ -626,6 +627,17 @@ bool Inset::undefined() const
}
/// Default contribution of an inset to the statistics: a letter-like
/// inset adds the length of its string form to the characters, a
/// space-like inset adds one blank, anything else adds nothing.
void Inset::updateStatistics(Statistics & stats) const
{
	if (!isLetter()) {
		if (isSpace())
			++stats.blank_count;
		return;
	}

	odocstringstream os;
	toString(os);
	stats.char_count += os.str().length();
}
CtObject Inset::getCtObject(OutputParams const &) const
{
return CtObject::Normal;

View File

@ -60,6 +60,7 @@ class MathAtom;
class MetricsInfo;
class PainterInfo;
class ParIterator;
class Statistics;
class Text;
class TocBackend;
class XMLStream;
@ -480,9 +481,8 @@ public:
/// is this equivalent to a space (which is BTW different from
/// a line separator)?
virtual bool isSpace() const { return false; }
/// returns chars, words if the inset is equivalent to such, otherwise
/// (0,0), which should be interpreted as 'false'
virtual std::pair<int, int> isWords() const { return std::pair<int,int>(0, 0); }
/// Count words, characters and spaces in inset
virtual void updateStatistics(Statistics & stats) const;
/// does this inset try to use all available space (like \\hfill does)?
virtual bool isHfill() const { return false; }

View File

@ -29,6 +29,7 @@
#include "output_xhtml.h"
#include "output_docbook.h"
#include "ParIterator.h"
#include "Statistics.h"
#include "texstream.h"
#include "TocBackend.h"
@ -791,10 +792,9 @@ void InsetCitation::latex(otexstream & os, OutputParams const & runparams) const
}
pair<int, int> InsetCitation::isWords() const
void InsetCitation::updateStatistics(Statistics & stats) const
{
docstring const label = generateLabel(false);
return pair<int, int>(label.size(), wordCount(label));
stats.update(generateLabel(false));
}

View File

@ -104,7 +104,7 @@ public:
/// search and open citation source
void openCitation();
///
std::pair<int, int> isWords() const override;
void updateStatistics(Statistics & stats) const override;
private:
/// tries to make a pretty label and makes a basic one if not

View File

@ -20,8 +20,9 @@
#include "LyX.h"
#include "output_docbook.h"
#include "output_xhtml.h"
#include "xml.h"
#include "Statistics.h"
#include "texstream.h"
#include "xml.h"
#include "support/debug.h"
#include "support/docstream.h"
@ -299,10 +300,9 @@ void InsetHyperlink::validate(LaTeXFeatures & features) const
}
pair<int, int> InsetHyperlink::isWords() const
void InsetHyperlink::updateStatistics(Statistics & stats) const
{
docstring const label = getParam("name");
return pair<int, int>(label.size(), wordCount(label));
stats.update(getParam("name"));
}

View File

@ -55,7 +55,7 @@ public:
///
docstring xhtml(XMLStream &, OutputParams const &) const override;
///
std::pair<int, int> isWords() const override;
void updateStatistics(Statistics & stats) const override;
//@}
/// \name Static public methods obligated for InsetCommand derived classes

View File

@ -28,6 +28,7 @@
#include "MetricsInfo.h"
#include "Paragraph.h"
#include "ParIterator.h"
#include "Statistics.h"
#include "texstream.h"
#include "xml.h"
@ -1032,7 +1033,7 @@ string InsetQuotes::contextMenuName() const
}
pair<int, int> InsetQuotes::isWords() const
void InsetQuotes::updateStatistics(Statistics & stats) const
{
int length = 1;
// In PassThru, we use straight quotes otherwise we need to check for French
@ -1052,7 +1053,7 @@ pair<int, int> InsetQuotes::isWords() const
}
//one or two characters from the statistics perspective
return std::pair<int,int>(length, 0);
stats.char_count += length;
}
} // namespace lyx

View File

@ -178,7 +178,7 @@ public:
/// Returns the current quote type
std::string getType() const;
///
std::pair<int, int> isWords() const override;
void updateStatistics(Statistics & stats) const override;
private:
///

View File

@ -45,6 +45,7 @@
#include "output_xhtml.h"
#include "Paragraph.h"
#include "ParIterator.h"
#include "Statistics.h"
#include "TexRow.h"
#include "texstream.h"
#include "TextClass.h"
@ -5017,6 +5018,13 @@ void InsetTabular::updateBuffer(ParIterator const & it, UpdateType utype, bool c
}
/// Add the text of every cell of the table to the statistics.
void InsetTabular::updateStatistics(Statistics & stats) const
{
	idx_type cell = 0;
	while (cell < nargs()) {
		stats.update(*getText(cell));
		++cell;
	}
}
void InsetTabular::addToToc(DocIterator const & cpit, bool output_active,
UpdateType utype, TocBackend & backend) const
{

View File

@ -1113,9 +1113,12 @@ public:
Inset * editXY(Cursor & cur, int x, int y) override;
/// can we go further down on mouse click?
bool descendable(BufferView const &) const override { return true; }
/// Update the counters of this inset and of its contents
void updateBuffer(ParIterator const &, UpdateType, bool const deleted = false) override;
/// Count words, characters and blanks in the cells of this table
void updateStatistics(Statistics & stats) const override;
///
void addToToc(DocIterator const & di, bool output_active,
UpdateType utype, TocBackend & backend) const override;

View File

@ -50,6 +50,7 @@
#include "Paragraph.h"
#include "ParagraphParameters.h"
#include "ParIterator.h"
#include "Statistics.h"
#include "TexRow.h"
#include "texstream.h"
#include "TextClass.h"
@ -1141,6 +1142,12 @@ void InsetText::updateBuffer(ParIterator const & it, UpdateType utype, bool cons
}
/// Add the text held by this inset to the statistics.
void InsetText::updateStatistics(Statistics & stats) const
{
	Text const & contents = text();
	stats.update(contents);
}
void InsetText::toString(odocstream & os) const
{
os << text().asString(0, 1, AS_STR_LABEL | AS_STR_INSETS);

View File

@ -177,6 +177,9 @@ public:
/// Update the counters of this inset and of its contents
void updateBuffer(ParIterator const &, UpdateType, bool const deleted = false) override;
///
void updateStatistics(Statistics & stats) const override;
///
void setMacrocontextPositionRecursive(DocIterator const & pos);
///