Compare feature take two.

With this patch, we can recursively walk through the document. The document is split around a middle snake and each part is processed further. The snakes are added to the output document as well as the added and deleted parts. Now, the only thing left is to commit the implementation of the find_middle_snake function. git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@32591 a592a061-630c-0410-9148-cb99ea01b6c8
2024-11-27 03:36:39 +00:00 · 2009-12-19 18:15:20 +00:00 · 2009-12-19 18:15:20 +00:00 · 28dab69a0e
commit 28dab69a0e
parent 04580d2122
1 changed files with 512 additions and 9 deletions
--- a/src/Compare.cpp
+++ b/src/Compare.cpp
@ -14,7 +14,13 @@

 #include "Buffer.h"
 #include "BufferParams.h"
+#include "Changes.h"

+#include "insets/InsetText.h"
+
+#include "support/lassert.h"	
+
+#include <boost/next_prior.hpp>

 using namespace std;
 using namespace lyx::support;
@ -22,6 +28,155 @@ using namespace lyx::support;

 namespace lyx {

+void step_forward(DocIterator & dit)
+{
+	dit.top().forwardPos();
+}
+
+
+void step_backward(DocIterator & dit)
+{
+	dit.top().backwardPos();
+}
+
+
+bool step_forward(DocIterator & dit, DocIterator const & end)
+{
+	if (dit == end)
+		return false;
+	step_forward(dit);
+	return true;
+}
+
+
+bool step_backward(DocIterator & dit, DocIterator const & beg)
+{
+	if (dit == beg)
+		return false;
+	step_backward(dit);
+	return true;
+}
+
+/**
+ * A pair of two DocIterators that form a range.
+ */
+class DocRange {
+public:
+	DocRange(DocIterator from_, DocIterator to_)
+		: from(from_), to(to_)
+	{}
+
+	DocRange(Buffer const * buf)
+	{
+		from = doc_iterator_begin(buf);
+		to = doc_iterator_end(buf);
+		to.backwardPos();
+	}
+
+	///
+	Text * text() const { return from.text(); }
+	///
+	bool empty() const { return to <= from;	}
+	///
+	size_t length() const;
+
+	/// The begin of the range
+	DocIterator from;
+	/// The end of the range
+	DocIterator to;
+};
+
+
+size_t DocRange::length() const
+{
+	pit_type startpit = from.pit();
+	pit_type endpit = to.pit();
+	ParagraphList const & ps_ = from.text()->paragraphs();
+
+	ParagraphList pars(boost::next(ps_.begin(), startpit),
+				boost::next(ps_.begin(), endpit + 1));
+
+	// Remove the end of the last paragraph; afterwards, remove the
+	// beginning of the first paragraph.
+	Paragraph & back = pars.back();
+	back.eraseChars(to.pos(), back.size(), false);
+	Paragraph & front = pars.front();
+	front.eraseChars(0, from.pos(), false);
+
+	ParagraphList::const_iterator pit = pars.begin();
+	ParagraphList::const_iterator end_it = pars.end();
+
+	size_t length = 0;
+	for (; pit != end_it; ++pit)
+		length += pit->size() + 1;
+
+	// The last paragraph has no paragraph-end
+	--length;
+	return length;	
+}
+
+
+class DocPair {
+public:
+	DocPair() {}
+
+	DocPair(DocIterator o_, DocIterator n_)
+		: o(o_), n(n_)
+	{}
+
+	DocPair & operator++()
+	{
+		step_forward(o);
+		step_forward(n);
+		return *this;
+	}
+	///
+	DocIterator o;
+	///
+	DocIterator n;
+};
+	
+/**
+ * A pair of two DocRanges.
+ */
+class DocRangePair {
+public:
+	DocRangePair(DocRange o_, DocRange n_)
+		: o(o_), n(n_)
+	{}
+	
+	DocRangePair(DocPair from, DocPair to)
+		: o(from.o, to.o), n(from.n, to.n)
+	{}
+
+	DocRangePair(Buffer const * o_buf, Buffer const * n_buf)
+		: o(o_buf), n(n_buf)
+	{}
+
+	/// Returns the from pair
+	DocPair from() const { return DocPair(o.from, n.from); }
+
+	/// Returns the to pair
+	DocPair to() const { return DocPair(o.to, n.to); }
+
+	DocRange o;
+	DocRange n;
+};
+
+
+DocRangePair stepIntoInset(DocPair const & inset_location)
+{
+	DocRangePair rp(inset_location, inset_location);
+	rp.o.from.forwardPos();
+	rp.n.from.forwardPos();
+	step_forward(rp.o.to);
+	step_forward(rp.n.to);
+	rp.o.to.backwardPos();
+	rp.n.to.backwardPos();
+	return rp;
+}
+
+
 /**
 * The implementation of the algorithm that does the comparison
 * between two documents.
@ -36,14 +191,74 @@ public:
 	///
 	~Impl() {}

-	/// Set to true to abort the algorithm
+	// Algorithm to find the shortest edit string. This algorithm 
+	// only needs a linear amount of memory (linear with the sum
+	// of the number of characters in the two paragraph-lists).
+	bool diff(Buffer const * new_buf, Buffer const * old_buf,
+		Buffer const * dest_buf);
+
+	/// Set to true to cancel the algorithm
 	bool abort_;

 private:
+	// Finds the middle snake and returns the length of the
+	// shortest edit script.
+	int find_middle_snake(DocRangePair const & rp, DocPair & middle_snake);
+
+	// This function is called recursively by a divide and conquer
+	// algorithm. Each time, the string is divided into two split
+	// around the middle snake.
+	void diff_i(DocRangePair const & rp);
+
+	/// Processes the splitted chunks. It either adds them as deleted,
+	/// as added, or call diff_i for further processing.
+	void diff_part(DocRangePair const & rp);
+
+	/// Runs the algorithm for the inset located at /c it and /c it_n 
+	/// and adds the result to /c pars.
+	void diff_inset(Inset * inset, DocPair const & p);
+
+	// Adds the snake to the destination buffer. The algorithm will
+	// recursively be applied to any InsetTexts that are within the snake.
+	void process_snake(DocRangePair const & rp);
+
+	/// Writes the range to the destination buffer
+	void writeToDestBuffer(DocRange const & range,
+		Change::Type type = Change::UNCHANGED);
+	
+	/// Writes the paragraph list to the destination buffer
+	void writeToDestBuffer(ParagraphList const & copy_pars) const;
+
+	/// The length of the first chunk currently processed
+	int N;
+	/// The length of the second chunk currently processed
+	int M;
+
 	/// The thread object, used to emit signals to the GUI
 	Compare const & compare_;
+
+	/// The buffer containing text that will be marked as old
+	Buffer const * old_buf_;
+	/// The buffer containing text that will be marked as new
+	Buffer const * new_buf_;
+	/// The buffer containing text that will be marked as new
+	Buffer const * dest_buf_;
+
+	/// The paragraph list of the destination buffer
+	ParagraphList * dest_pars_;
+
+	/// The level of recursion
+	int recursion_level_;
+
+	/// The number of nested insets at this level
+	int nested_inset_level_;
 };

+/////////////////////////////////////////////////////////////////////
+//
+// Compare
+//
+/////////////////////////////////////////////////////////////////////

 Compare::Compare(Buffer const * new_buf, Buffer const * old_buf,
 	Buffer * const dest_buf, CompareOptions const & options)
@ -55,10 +270,8 @@ Compare::Compare(Buffer const * new_buf, Buffer const * old_buf,

 void Compare::run()
 {
-	if (!dest_buffer || !new_buffer || !old_buffer) {
-		error();
+	if (!dest_buffer || !new_buffer || !old_buffer)
 		return;
-	}

 	// Copy the buffer params to the new buffer
 	dest_buffer->params() = options_.settings_from_new
@ -66,13 +279,19 @@ void Compare::run()
 	
 	// do the real work
 	if (!doCompare())
-		error();
-	else
-		finished(pimpl_->abort_);
+		return;
+	
+	finished(pimpl_->abort_);
 	return;
 }


+int Compare::doCompare()
+{
+	return pimpl_->diff(new_buffer, old_buffer, dest_buffer);
+}
+
+
 void Compare::abort()
 {
 	pimpl_->abort_ = true;
@ -82,9 +301,293 @@ void Compare::abort()
 }


-int Compare::doCompare()
+void get_paragraph_list(DocRange const & range,
+	ParagraphList & pars)
 {
-	return 0;
+	// Clone the paragraphs within the selection.
+	pit_type startpit = range.from.pit();
+	pit_type endpit = range.to.pit();
+	ParagraphList const & ps_ = range.text()->paragraphs();
+	ParagraphList tmp_pars(boost::next(ps_.begin(), startpit),
+		boost::next(ps_.begin(), endpit + 1));
+
+	// Remove the end of the last paragraph; afterwards, remove the
+	// beginning of the first paragraph. Keep this order - there may only
+	// be one paragraph!
+	Paragraph & back = tmp_pars.back();
+	back.eraseChars(range.to.pos(), back.size(), false);
+	Paragraph & front = tmp_pars.front();
+	front.eraseChars(0, range.from.pos(), false);
+
+	pars.insert(pars.begin(), tmp_pars.begin(), tmp_pars.end());
+}
+
+
+bool equal(Inset const * i_o, Inset const * i_n)
+{
+	if (!i_o || !i_n)
+		return false;
+
+	// Different types of insets
+	if (i_o->lyxCode() != i_n->lyxCode())
+		return false;
+
+	// Editable insets are assumed to be the same as they are of the 
+	// same type. If we later on decide that we insert them in the
+	// document as being unchanged, we will run the algorithm on the
+	// contents of the two insets.
+	// FIXME: This fails if the parameters of the insets differ.
+	// FIXME: We do not recurse into InsetTabulars.
+	// FIXME: We need methods inset->equivalent(inset).
+	if (i_o->editable() && !i_o->asInsetMath()
+		  && i_o->asInsetText())
+		return true;
+
+	ostringstream o_os;
+	ostringstream n_os;
+	i_o->write(o_os);
+	i_n->write(n_os);
+	return o_os.str() == n_os.str();
+}
+
+
+bool equal(DocIterator & o, DocIterator & n) {
+	Paragraph const & old_par = o.text()->getPar(o.pit());
+	Paragraph const & new_par = n.text()->getPar(n.pit());
+
+	Inset const * i_o = old_par.getInset(o.pos());
+	Inset const * i_n = new_par.getInset(n.pos());
+
+	if (i_o && i_n)
+		return equal(i_o, i_n);
+	
+	char_type c_o = old_par.getChar(o.pos());
+	char_type c_n = new_par.getChar(n.pos());
+	Font fo = old_par.getFontSettings(o.buffer()->params(), o.pos());
+	Font fn = new_par.getFontSettings(n.buffer()->params(), n.pos());
+	return c_o == c_n && fo == fn;
+}
+
+
+void traverse_snake_back(DocRangePair & rp)
+{
+	while (true) {
+		// Traverse snake
+		if (!step_backward(rp.o.to, rp.o.from))
+			break;
+
+		if (!step_backward(rp.n.to, rp.n.from)) {
+			step_forward(rp.o.to);
+			break;
+		}
+
+		if (!equal(rp.o.to, rp.n.to)) {
+			step_forward(rp.o.to);
+			step_forward(rp.n.to);
+			break;
+		}
+	}
+}
+
+
+void traverse_snake_forw(DocRangePair & rp)
+{
+	while (equal(rp.o.from, rp.n.from)) {
+		if (!step_forward(rp.o.from, rp.o.to))
+			break;
+
+		if (!step_forward(rp.n.from, rp.n.to)) {
+			step_backward(rp.o.from);
+			break;
+		}
+	}
+}
+
+/////////////////////////////////////////////////////////////////////
+//
+// Compare::Impl
+//
+/////////////////////////////////////////////////////////////////////
+
+int Compare::Impl::find_middle_snake(DocRangePair const & rp,
+	DocPair & middle_snake)
+{
+	N = rp.o.length();
+	M = rp.n.length();
+	return M+N;
+}
+
+
+bool Compare::Impl::diff(Buffer const * new_buf, Buffer const * old_buf,
+	Buffer const * dest_buf)
+{
+	if (!new_buf || !old_buf || !dest_buf)
+		return false;
+
+	old_buf_ = old_buf;
+	new_buf_ = new_buf;
+	dest_buf_ = dest_buf;
+	dest_pars_ = &dest_buf->inset().asInsetText()->paragraphs();
+	dest_pars_->clear();
+
+	recursion_level_ = 0;
+	nested_inset_level_ = 0;
+
+	DocRangePair rp(old_buf_, new_buf_);
+
+	DocPair from = rp.from();
+	traverse_snake_forw(rp);
+	DocRangePair const snake(from, rp.from());
+	process_snake(snake);
+	
+	// Start the recursive algorithm
+	diff_i(rp);
+
+	for (pit_type p = 0; p < (pit_type)dest_pars_->size(); ++p) {
+		(*dest_pars_)[p].setBuffer(const_cast<Buffer &>(*dest_buf));
+		(*dest_pars_)[p].setInsetOwner(&dest_buf_->inset());
+	}
+
+	return true;
+}
+
+
+void Compare::Impl::diff_i(DocRangePair const & rp)
+{
+	// The middle snake
+	DocPair middle_snake;
+
+	// Divides the problem into two smaller problems, split around
+	// the snake in the middle.
+	int const L_ses = find_middle_snake(rp, middle_snake);
+
+	// Set maximum of progress bar
+	if (++recursion_level_ == 1)
+		compare_.progressMax(L_ses);
+
+	// There are now three possibilities: the strings were the same,
+	// the strings were completely different, or we found a middle
+	// snake and we can split the string into two parts to process.
+	if (L_ses == 0)
+		// Two the same strings (this must be a very rare case, because
+		// usually this will be part of a snake adjacent to these strings).
+		writeToDestBuffer(rp.o);
+
+	else if (middle_snake.o.empty()) {
+		// Two totally different strings
+		writeToDestBuffer(rp.o, Change::DELETED);
+		writeToDestBuffer(rp.n, Change::INSERTED);
+
+	} else {
+		// Retrieve the complete snake
+		DocRangePair first_part(rp.from(), middle_snake);
+		traverse_snake_back(first_part);
+			
+		DocRangePair second_part(middle_snake, rp.to());
+		traverse_snake_forw(second_part);
+			
+		// Split the string in three parts:
+		// 1. in front of the snake
+		diff_part(first_part);
+
+		// 2. the snake itself, and
+		DocRangePair const snake(first_part.to(), second_part.from());
+		process_snake(snake);
+
+		// 3. behind the snake.
+		diff_part(second_part);
+	}
+	--recursion_level_;
+}
+
+
+void Compare::Impl::diff_part(DocRangePair const & rp)
+{
+	// Is there a finite length string in both buffers, if not there
+	// is an empty string and we write the other one to the buffer.
+	if (!rp.o.empty() && !rp.n.empty())
+		diff_i(rp);
+	
+	else if (!rp.o.empty())
+		writeToDestBuffer(rp.o, Change::DELETED);
+
+	else if (!rp.n.empty())
+		writeToDestBuffer(rp.n, Change::INSERTED);
+}
+
+
+void Compare::Impl::diff_inset(Inset * inset, DocPair const & p)
+{
+	// Find the dociterators for the beginning and the
+	// end of the inset, for the old and new document.
+	DocRangePair const rp = stepIntoInset(p);
+
+	// Recurse into the inset. Temporarily replace the dest_pars
+	// paragraph list by the paragraph list of the nested inset.
+	ParagraphList * backup_dest_pars = dest_pars_;
+	dest_pars_ = &inset->asInsetText()->text().paragraphs();
+	dest_pars_->clear();
+	
+	++nested_inset_level_;
+	diff_i(rp);
+	--nested_inset_level_;
+
+	dest_pars_ = backup_dest_pars;
+}
+
+
+void Compare::Impl::process_snake(DocRangePair const & rp)
+{
+	ParagraphList pars;
+	get_paragraph_list(rp.o, pars);
+
+	// Find insets in this paragaph list
+	DocPair it = rp.from();
+	for (; it.o < rp.o.to; ++it) {
+		Inset * inset = it.o.text()->getPar(it.o.pit()).getInset(it.o.pos());
+		if (inset && inset->editable() && inset->asInsetText()) {
+			// Find the inset in the paragraph list that will be pasted into
+			// the final document. The contents of the inset will be replaced
+			// by the output of the algorithm below.
+			pit_type const pit = it.o.pit() - rp.o.from.pit();
+			pos_type const pos = pit ? it.o.pos() : it.o.pos() - rp.o.from.pos();
+			inset = pars[pit].getInset(pos);
+			LASSERT(inset, /**/);
+			diff_inset(inset, it);
+		}
+	}
+	writeToDestBuffer(pars);
+}
+
+
+void Compare::Impl::writeToDestBuffer(DocRange const & range,
+	Change::Type type)
+{
+	ParagraphList pars;
+	get_paragraph_list(range, pars);
+
+	pos_type size = 0;
+
+	// Set the change
+	ParagraphList::iterator it = pars.begin();
+	for (; it != pars.end(); ++it) {
+		it->setChange(Change(type));
+		size += it->size();
+	}
+
+	writeToDestBuffer(pars);
+
+	if (nested_inset_level_ == 0)
+		compare_.progress(size);
+}
+
+
+void Compare::Impl::writeToDestBuffer(ParagraphList const & pars) const
+{
+	pit_type const pit = dest_pars_->size() - 1;
+	dest_pars_->insert(dest_pars_->end(), pars.begin(), pars.end());
+	if (pit >= 0)
+		mergeParagraph(dest_buf_->params(), *dest_pars_, pit);
 }