1999-09-27 18:44:28 +00:00
|
|
|
# This file is part of reLyX
|
|
|
|
# Copyright (c) 1998-9 Amir Karger karger@post.harvard.edu
|
|
|
|
# You are free to use and modify this code under the terms of
|
|
|
|
# the GNU General Public Licence version 2 or later.
|
|
|
|
|
|
|
|
package CleanTeX;
|
|
|
|
# This package prepares a LaTeX file for translation to LyX
|
|
|
|
# - Translates some local commands (e.g., {\em blah} to {\emph{blah}})
|
|
|
|
# - Prepares math mode stuff for LyX. LyX reads LaTeX math mode directly,
|
|
|
|
# so reLyX can basically copy all math mode exactly, but LyX is a
|
|
|
|
# bit stricter than LaTeX. E.g., translate 'x^2' -> 'x^{2}'
|
|
|
|
# - Removes optional arguments if LyX doesn't understand them, e.g. \\
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
|
|
|
|
use Verbatim;
|
|
|
|
|
|
|
|
######
|
|
|
|
# Global variables
|
|
|
|
my $last_eaten; # last token we ate
|
|
|
|
|
|
|
|
# List of commands for which LyX doesn't support the optional argument
|
|
|
|
my @DeleteOptArg = map {"\\$_"} qw(\\ \\*
|
|
|
|
chapter section subsection subsubsection paragraph subparagraph
|
|
|
|
);
|
|
|
|
|
|
|
|
my $debug_on; # was -d option given?
|
|
|
|
|
|
|
|
######################### PARSER INVOCATION ################################
|
|
|
|
sub call_parser {
|
|
|
|
# This subroutine opens the TeX parser and processes the file.
|
|
|
|
# Arg0 is the name of the input TeX file
|
|
|
|
# Arg1 is the name of the output "clean" file
|
|
|
|
|
|
|
|
my ($InFileName, $OutFileName) = (shift,shift);
|
|
|
|
|
|
|
|
$debug_on = (defined($main::opt_d) && $main::opt_d);
|
|
|
|
my $zzz=$debug_on ? " TeX file ($InFileName --> $OutFileName)\n" :"... ";
|
|
|
|
print STDERR "Cleaning$zzz";
|
|
|
|
open (OUTFILE, ">$OutFileName") or die "problem opening $OutFileName: $!\n";
|
|
|
|
|
|
|
|
# Create the list of tokens for the parser
|
|
|
|
# Parts of the token list are swiped from TeX.pm
|
|
|
|
my %MyTokens = ( '{' => $Text::TeX::Tokens{'{'},
|
|
|
|
'}' => $Text::TeX::Tokens{'}'},
|
|
|
|
'$' => $Text::TeX::Tokens{'$'},
|
|
|
|
'$$' => $Text::TeX::Tokens{'$$'},
|
|
|
|
'\begin' => $Text::TeX::Tokens{'\begin'},
|
|
|
|
'\end' => $Text::TeX::Tokens{'\end'},
|
|
|
|
);
|
|
|
|
|
|
|
|
# Put local tokens, like \em, into %MyTokens
|
|
|
|
#Note: \cal is "local", although it's found in math mode
|
|
|
|
# (The "map" just puts a backslash in front of each word in the list)
|
|
|
|
my @LocalTokens = qw (em rm bf tt sf sc sl it
|
|
|
|
rmfamily ttfamily sffamily mdseries bfseries
|
|
|
|
upshape itshape slshape scshape cal
|
2004-10-28 14:35:53 +00:00
|
|
|
);
|
1999-09-27 18:44:28 +00:00
|
|
|
foreach (@LocalTokens) {
|
|
|
|
$MyTokens{"\\$_"} = $Text::TeX::Tokens{'\em'}
|
|
|
|
}
|
|
|
|
# Now add any commands
|
|
|
|
&ReadCommands::Merge(\%MyTokens);
|
|
|
|
|
|
|
|
# Create the fileobject
|
2004-10-28 14:35:53 +00:00
|
|
|
my $file = new Text::TeX::OpenFile
|
1999-09-27 18:44:28 +00:00
|
|
|
$InFileName,
|
|
|
|
'defaultact' => \&clean_tex,
|
|
|
|
'tokens' => \%MyTokens;
|
|
|
|
|
|
|
|
# Now actually process the file
|
|
|
|
$file->process;
|
|
|
|
close OUTFILE;
|
|
|
|
#warn "Done cleaning TeX file\n";
|
|
|
|
} # end sub call_parser
|
|
|
|
|
|
|
|
|
|
|
|
####################### MAIN TRANSLATING SUBROUTINE ########################
|
|
|
|
# Routine called by the TeX-parser to perform token-processing.
|
|
|
|
sub clean_tex {
|
|
|
|
my($eaten,$txt) = (shift,shift);
|
|
|
|
my ($outstr, $type);
|
|
|
|
|
|
|
|
# Translation table for TT::Token tokens whose translations should
|
|
|
|
# NOT have whitespace after them! See sub translate...
|
|
|
|
# Note that tokens of type TT::EndLocal are always translated to '}'. So,
|
|
|
|
# any token defined as a local token *must* be translated to something
|
|
|
|
# with a '{' (e.g., '\em' -> '\emph{') or we'll have mismatched braces
|
|
|
|
my %no_ws_transtbl = (
|
|
|
|
'\em' => '\emph{',
|
|
|
|
'\rm' => '\textrm{',
|
|
|
|
'\bf' => '\textbf{',
|
|
|
|
'\tt' => '\texttt{',
|
|
|
|
'\sf' => '\textsf{',
|
|
|
|
'\sc' => '\textsc{',
|
|
|
|
'\sl' => '\textsl{',
|
|
|
|
'\it' => '\textit{',
|
|
|
|
'\rmfamily' => '\textrm{',
|
|
|
|
'\ttfamily' => '\texttt{',
|
|
|
|
'\sffamily' => '\textsf{',
|
|
|
|
'\mdseries' => '\textmd{',
|
|
|
|
'\bfseries' => '\textbf{',
|
|
|
|
'\upshape' => '\textup{',
|
|
|
|
'\itshape' => '\textit{',
|
|
|
|
'\slshape' => '\textsl{',
|
|
|
|
'\scshape' => '\textsc{',
|
|
|
|
'\cal' => '\mathcal{',
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
# a faux "switch" statement. sets $_ for later use in pattern
|
|
|
|
# matching.
|
|
|
|
$type = ref($eaten);
|
|
|
|
$type =~ s/^Text::TeX::// or die "Non-Text::TeX object";
|
|
|
|
my $printstr = ""; # default for undefined printstrs etc.
|
|
|
|
SWITCH: for ($type) {
|
|
|
|
# Handle blank lines.
|
|
|
|
if (/Paragraph/) {
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Handle the end of a local font command - insert a '}'
|
|
|
|
if (/EndLocal/) {
|
2003-02-07 12:11:58 +00:00
|
|
|
$printstr = '}';
|
1999-09-27 18:44:28 +00:00
|
|
|
last SWITCH;
|
|
|
|
}
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
# $eaten->exact_print is undefined for previous environments
|
|
|
|
$outstr = $eaten->exact_print;
|
|
|
|
if (! defined $outstr) { # comment at end of paragraph
|
|
|
|
warn "Weird undefined token $eaten!" unless $eaten->comment;
|
|
|
|
last SWITCH;
|
|
|
|
}
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
# Handle LaTeX tokens
|
|
|
|
if (/^Token$/) {
|
|
|
|
my $realtok = $eaten->print; # w/out whitespace
|
|
|
|
# If a comment is its own paragraph, print nothing
|
|
|
|
last SWITCH unless defined($realtok);
|
|
|
|
# Special handling for \verb and \verb*
|
|
|
|
if ($realtok =~ /^\\verb\*?/) {
|
|
|
|
$printstr = &Verbatim::copy_verb($txt,$eaten);
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Translate token if necessary, or just print it
|
|
|
|
# "no_ws" is HACK to remove whitespace, so '\em ' -> '\emph{'
|
|
|
|
$printstr = &translate($outstr, \%no_ws_transtbl, "no_ws");
|
|
|
|
|
|
|
|
# Ignore optional argument(s) if necessary
|
|
|
|
$printstr .= &handle_opt_args($eaten,$txt);
|
|
|
|
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Tokens taking arguments, like '^'
|
|
|
|
# ADD '{' if there isn't one before the argument!
|
2000-03-29 23:02:36 +00:00
|
|
|
# TODO can we check whether the command is \label, \include
|
|
|
|
# and not add the braces in that case?
|
1999-09-27 18:44:28 +00:00
|
|
|
if (/^BegArgsToken$/) {
|
|
|
|
$printstr = $outstr;
|
|
|
|
|
|
|
|
# Ignore optional argument(s) if necessary
|
|
|
|
$printstr .= &handle_opt_args($eaten,$txt);
|
|
|
|
|
|
|
|
# Add beginning brace before the 1st argument if there isn't one
|
|
|
|
my $tok = $txt->lookAheadToken;
|
|
|
|
$printstr .= '{' unless ($tok =~ /\{/);
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# End of one argument, beginning of next
|
|
|
|
# Note: by default ArgToken,EndArgsToken print nothing
|
|
|
|
# ADD '}' if there isn't one after the last argument
|
|
|
|
# Then read and print any optional arguments which may exist
|
|
|
|
# between this argument the next (we must do this here or we would
|
|
|
|
# add a '{' before an optional argument!)
|
|
|
|
# ADD '{' if there isn't one before the next argument!
|
|
|
|
# (just like we do in BegArgsToken and EndArgsToken)
|
|
|
|
if (/^ArgToken$/) {
|
|
|
|
$printstr = $outstr; # = ''
|
|
|
|
|
|
|
|
# Add '}' after the argument that ended if necessary
|
|
|
|
$printstr .= '}' unless $last_eaten->print eq "\}";
|
|
|
|
|
|
|
|
# Eat and print any optional arguments
|
|
|
|
$printstr .= &handle_opt_args($eaten,$txt);
|
|
|
|
|
|
|
|
# Add '{' before the next argument if necessary
|
|
|
|
my $tok = $txt->lookAheadToken;
|
|
|
|
$printstr .= '{' unless ($tok =~ /\{/);
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# End of tokens taking arguments, like '^'
|
2004-10-28 14:35:53 +00:00
|
|
|
# ADD '}' if there isn't one after the last argument, i.e.,
|
1999-09-27 18:44:28 +00:00
|
|
|
# if the previous token *wasn't* a '}'
|
|
|
|
# Kludge: for TeX style \input command ("\input foo" with no
|
|
|
|
# braces) we need to read the whole filename, but parser will have
|
|
|
|
# read only one char. So read in the rest of the filename before
|
|
|
|
# printing the '}'.
|
|
|
|
if (/^EndArgsToken$/) {
|
|
|
|
$printstr = $outstr; # = ''
|
|
|
|
|
|
|
|
unless ($last_eaten->print eq "\}") {
|
|
|
|
my $s = $eaten->base_token;
|
|
|
|
if ($s->print eq "\\input") {
|
|
|
|
my $t = $txt->lookAheadToken;
|
|
|
|
# For one-char filename (a.tex) do nothing
|
|
|
|
if ($t =~ /^[\w.\-]/) {
|
|
|
|
my $u = $txt->eatMultiToken;
|
|
|
|
$t = $u->print;
|
|
|
|
$t =~ s/\s+//g;
|
|
|
|
$printstr .= $t;
|
|
|
|
}
|
|
|
|
# TeX \input always adds .tex ending
|
|
|
|
$printstr .= ".tex";
|
|
|
|
}
|
|
|
|
|
|
|
|
$printstr .= '}';
|
|
|
|
}
|
|
|
|
|
|
|
|
# Don't bother eating optional args coming after the last
|
|
|
|
# required arg: they'll just be copied as text
|
|
|
|
last SWITCH;
|
|
|
|
}
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
# Handle opening groups, like '{' and '$'.
|
|
|
|
if (/Begin::Group$/) {
|
2003-02-07 12:11:58 +00:00
|
|
|
$printstr = $outstr;
|
1999-09-27 18:44:28 +00:00
|
|
|
last SWITCH;
|
|
|
|
}
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
# Handle closing groups, like '}' and '$'.
|
|
|
|
if (/End::Group$/) {
|
2003-02-07 12:11:58 +00:00
|
|
|
$printstr = $outstr;
|
1999-09-27 18:44:28 +00:00
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (/Begin::Group::Args/) {
|
|
|
|
my $env = $eaten->environment;
|
|
|
|
$printstr = $outstr;
|
|
|
|
if ($env eq "verbatim" || $env eq "reLyXskip") {
|
|
|
|
# copy everything up to "\end{foo}"
|
|
|
|
$printstr .= &Verbatim::copy_verbatim($txt, $eaten);
|
|
|
|
}
|
|
|
|
last SWITCH;
|
|
|
|
}
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
if (/End::Group::Args/) {
|
|
|
|
$printstr = $outstr;
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (/Text/) {
|
|
|
|
$printstr = $outstr;
|
|
|
|
last SWITCH;
|
|
|
|
}
|
|
|
|
|
|
|
|
# The default action - print the string.
|
|
|
|
$printstr = $outstr;
|
|
|
|
} # end SWITCH:for ($type)
|
2004-10-28 14:35:53 +00:00
|
|
|
|
1999-09-27 18:44:28 +00:00
|
|
|
# Actually print the string
|
2004-10-28 14:35:53 +00:00
|
|
|
if (defined $printstr) {
|
1999-09-27 18:44:28 +00:00
|
|
|
print OUTFILE $printstr;
|
|
|
|
$last_eaten = $eaten; #save for next time
|
|
|
|
} else {warn "Undefined printstr";}
|
|
|
|
|
|
|
|
} # end sub clean_tex
|
|
|
|
|
|
|
|
#################### TRANSLATOR SUBROUTINES ###############################
|
|
|
|
sub translate {
|
|
|
|
# Replace a string (possibly with whitespace around it) with another
|
|
|
|
# Arg0 is a string, Arg1 is a reference to a hash containing translations
|
|
|
|
# If a token not in the table is passed in, do nothing
|
|
|
|
# If Arg2 is defined AND the token is known, then remove whitespace from
|
|
|
|
# the end of the translated token. This is a HACK to do '\em ' -> '\emph{'
|
|
|
|
# Return the string, possibly modified
|
|
|
|
my ($tokstr, $transref) = (shift, shift);
|
|
|
|
my $remove_ws = shift;
|
|
|
|
my %transtable = %$transref;
|
|
|
|
|
|
|
|
# remove whitespace from the string (since transtable doesn't have it)
|
|
|
|
my $stripstr = $tokstr;
|
|
|
|
$stripstr =~ s/^\s*(\S+)\s*$/$1/ or warn "couldn't strip token";
|
|
|
|
if ( exists $transtable{$stripstr} ) {
|
|
|
|
# use \Q or \, (, $, and [ will be misinterpreted
|
|
|
|
$tokstr =~ s/\Q$stripstr\E/$transtable{$stripstr}/;
|
|
|
|
|
|
|
|
# remove whitespace?
|
|
|
|
if (defined $remove_ws) {
|
|
|
|
$tokstr =~ s/\s*$//;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return $tokstr;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub handle_opt_args {
|
|
|
|
# read and concatenate OR IGNORE optional arguments
|
|
|
|
# Arg0 is a BegArgsToken or ArgToken
|
|
|
|
my ($eaten,$fileobject) = (shift,shift);
|
|
|
|
my $outstr = "";
|
|
|
|
|
|
|
|
# If at end of paragraph, don't bother looking for optArgs
|
|
|
|
return "" unless $fileobject->lookAheadToken;
|
|
|
|
|
|
|
|
# Get the next argument(s) expected for this token == /^o*[rR]?$/
|
|
|
|
# If there are no args expected, just return
|
|
|
|
my $curr_args = $eaten->next_args($fileobject) or return "";
|
|
|
|
|
|
|
|
# Now print or ignore any optional arguments
|
|
|
|
# If there's an 'r' in curr_args, we're done for now
|
|
|
|
my $foo;
|
|
|
|
my $token_name = $eaten->token_name; # (needed for EndArgsToken, e.g.)
|
|
|
|
while ($curr_args =~ s/^o//) {
|
|
|
|
my $opt = $fileobject->eatOptionalArgument;
|
|
|
|
# Print any initial space before the optional argument
|
|
|
|
if ($foo = $opt->exact_print) {
|
|
|
|
if ($foo =~ /^(\s+)/) {
|
|
|
|
$outstr .= $1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Print the argument or ignore it
|
|
|
|
if ($opt->print) {
|
|
|
|
if (grep /^\Q$token_name\E$/, @DeleteOptArg) {
|
|
|
|
print "Optional argument '",$opt->print,
|
|
|
|
"' to macro $token_name ignored\n";
|
|
|
|
} else {
|
|
|
|
$outstr .= "[" . $opt->print . "]";
|
|
|
|
}
|
|
|
|
} # Was an optional argument found?
|
|
|
|
}
|
|
|
|
|
|
|
|
return $outstr;
|
|
|
|
} # end sub handle_opt_args
|
|
|
|
|
|
|
|
1; # return true value to calling program
|