#!/usr/bin/perl -w
# -*- cperl -*-
=head1 NAME
texWordCount
=head1 SYNOPSIS
=head1 DESCRIPTION
Counts words in LaTeX file
use the -u switch for updating word count in a file delimited with
% texWordCount - begin
% texWordCount - end
lines.
use the -i switch to count a file that has no \begin{document} / \end{document} tags, e.g. a chapter file pulled in with \include
=head1 HISTORY
ORIGIN: created from templateApp.pl version 3.3 by Min-Yen Kan
ORIGIN: modified by Sam Tygier
\input tags
ORIGIN: modified by Sam Tygier
ORIGIN: modified by Gregor Heinrich
ORIGIN: modified by Phil Bettinson
RCS:$Log: texWordCount,v $
RCS:Revision 1.4 Thu Mar 27 14:17:28 SGT 2008 kanmy
RCS:Merging Gregor's edits in for doc classes with chapter levels
RCS:
RCS:Revision 1.3 2007/05/10 08:53:13 kanmy
RCS:Merging Sam's edits for handling recursive input. See subprocedure for include_files
RCS:
RCS:Revision 1.2 2007/01/12 08:53:13 kanmy
RCS:Sam Tygier's edits to handle input
RCS:
RCS:Revision 1.1 2002/10/07 19:31:19 min
RCS:Initial revision
RCS:
=cut
require 5.0;
use Getopt::Std;
# use strict 'vars';
# use diagnostics;
# from Sam Tygier
# no warnings ;
### USER customizable section
# Scratch-file path of the form /tmp/<script path without dots and
# slashes><process id><epoch seconds>.
my $tmpfile = $0;
$tmpfile =~ s/[\.\/]//g;
$tmpfile .= $$ . time;
if ($tmpfile =~ /^([-\@\w.]+)$/) { $tmpfile = $1; }    # untaint tmpfile variable
$tmpfile = "/tmp/" . $tmpfile;
# Last path component of the script name; prefixes every diagnostic.  The
# capture is used only when the match succeeded, so a leftover $1 from the
# untaint match above can never end up as the program name.
my $progname = ($0 =~ /([^\/]+)$/) ? $1 : $0;
my $outputVersion = "1.0";
### END user customizable section
### Ctrl-C handler
# Installed for SIGINT: reports the interrupt on STDERR, then ends the program.
sub quitHandler {
    my $notice = "\n# $progname fatal\t\tReceived a 'SIGINT'\n# $progname - exiting cleanly\n";
    print STDERR $notice;
    exit;
}
### HELP Sub-procedure
# Writes the usage summary, the option list and the note about STDIN input
# to STDERR in a single print.
sub Help {
    my @usage = (
        "usage: $progname -h\t\t\t\t[invokes help]\n",
        " $progname -v\t\t\t\t[invokes version]\n",
        " $progname [-duiq] filename(s)...\n",
    );
    my @options = (
        "Options:\n",
        "\t-d\tDebug Mode\n",
        "\t-i\tInclusive. Don't search for \{document\} tags. Good for \\includes\n",
        "\t-q\tQuiet Mode (don't echo license)\n",
        "\t-u\tUpdate word counts in file\n",
    );
    my @trailer = (
        "\n",
        "Will accept input on STDIN as a single file.\n",
        "\n",
    );
    print STDERR @usage, @options, @trailer;
}
### VERSION Sub-procedure
# Shows the POD embedded in this script (which carries the revision history)
# by running perldoc on it, then exits.  Dies when perldoc cannot be started
# or reports a failure.
sub Version {
    # List form: $0 reaches perldoc as one argument and is never re-parsed by
    # a shell, so a script path with spaces or metacharacters is handled.
    if (system ("perldoc", $0) != 0) {
        die "Need \"perldoc\" in PATH to print version information";
    }
    exit;
}
# Writes the one-line copyright notice to STDERR.
sub License {
    my $notice = "# Copyright 2002 \251 by Min-Yen Kan; modified by Sam Tygier, 2007. Modified by Phil Bettinson 2009\n";
    return print STDERR $notice;
}
###
### MAIN program
###
# Keep the full invocation: it is echoed in the report header and in the
# comment block written by -u.
my $cmdLine = $0 . " " . join (" ", @ARGV);
if (!@ARGV) { # invoked with no arguments, possible error in execution?
    print STDERR "# $progname info\t\tNo arguments detected, waiting for input on command line.\n";
    print STDERR "# $progname info\t\tIf you need help, stop this program and reinvoke with \"-h\".\n";
}
$SIG{'INT'} = \&quitHandler;
# Getopt::Std stores each switch in the package variable $opt_<letter>.
# Declaring them keeps "used only once" warnings away under -w and lets the
# file compile under "use strict".
our ($opt_d, $opt_h, $opt_i, $opt_q, $opt_u, $opt_v);
getopts ('dhiquv');
my $many = 0;       # set to 1 once a second file has been started
my $manySum = 0;    # running total over all files finished so far
# use (!defined $opt_X) for options with arguments
if (!$opt_q) { License(); }             # show the license unless -q was given
if ($opt_v) { Version(); exit(0); }     # call Version, if asked for
if ($opt_h) { Help(); exit (0); }       # call help, if asked for
if (!$opt_q) {
    # report header, suppressed by -q
    print "# LaTeX word count file format $outputVersion produced by $progname\n";
    print "# run as \"$cmdLine\"\n";
    print "# format: \\t x level of detail, number of words \\t section name\n";
}
## standardize input stream (either STDIN or first arg on command line)
my $fh;
my $filename = shift;
# Re-entry point for every further file named on the command line.  The label
# marks the whole statement at file level, so "goto NEWFILE" never lands in
# the middle of a block; the jump site has already put the next name into
# $filename.
NEWFILE:
if ($filename) {
    if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
    # three-argument open: the name is always treated as a file to read, even
    # when it starts or ends with characters such as ">" or "|"
    open (IF, "<", $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
    $fh = \*IF;
} else {
    # no file named: read standard input; this placeholder is what the report
    # shows as the file name
    $filename = "<STDIN>";
    $fh = \*STDIN;
}
# gregh: added chapter structure at depth 0 (section levels move down)
my $state = 0;                  # 1 while words are counted, 0 while they are skipped
if ($opt_i) { $state = 1; }     # -i: count from the very first line
my (@cSum, @sSum, @ssSum, @sssSum);             # word counts per chapter / section / subsection / subsubsection
my (@cNames, @sNames, @ssNames, @sssNames);     # heading texts, indexed like the counts
my $sum = 0;                    # words in the whole file
my @index = (0,0,0,0);          # current chapter, section, subsection, subsubsection slot
my $depth = 0;                  # deepest heading level currently open (0 = chapter level)
my $line = 0;                   # number of lines read so far
my @lines;                      # every line as read, for the -u rewrite
my ($twcBegin, $twcEnd);        # line indices of the "% texWordCount" markers
# read whole document and any included or inputed files;
# include_files() follows \input and \include lines inside the files it
# pulls in as well.
# warning, does not check \includeonly
my @headdoc = <$fh>;
my @wholedoc = include_files(@headdoc);
# chapter 0 for all text before first (or without any) chapter: @index
# already starts at slot 0, so it is left alone here.  Starting at slot 1
# would leave slot 0 empty in -i mode, which hides the whole breakdown in
# the report and shifts the chapter numbers by one.
# Scan the expanded document line by line.  Heading commands move the
# position kept in @index/$depth; while $state is 1 the words of the line are
# added to the totals of every level that is currently open.
foreach (@wholedoc) {
    $lines[$line] = $_;     # copy of the line as read, kept for the -u rewrite
    $line++;
    if (/^\#/) { next; } # skip comments
    elsif (/^\s+$/) { next; } # skip blank lines
    elsif (/^\% texWordCount \- begin$/) { $twcBegin = $line-1; } # note twc begin
    elsif (/^\% texWordCount \- end$/) { $twcEnd = $line-1; } # note twc end
    else {
        if (/^\\end\{document\}/) {
            $state = 0;
            if ($opt_d) { print "found end document - at word $sum, line $line\n"; }
        }
        # figures and tables are not counted: counting stops here and is
        # switched back on at the matching \end further down
        if (/^\\begin\{figure\*?\}/) {
            if ($opt_d) { print "found start figure - at word $sum, line $line\n"; }
            $state = 0;
        }
        if (/^\\begin\{table\*?\}/) {
            if ($opt_d) { print "found start table - at word $sum, line $line\n"; }
            $state = 0;
        }
        # Every heading opens a fresh slot on its own level and restarts all
        # deeper levels at slot 0.
        if (/^\\chapter\*?\{(.+)\}/) {
            if ($opt_d) { print "found chapter \"$1\" - at word $sum, line $line\n"; }
            $index[0]++;
            $cNames[$index[0]] = $1;
            $depth = 0;
            @index[1, 2, 3] = (0, 0, 0);
        }
        if (/^\\section\*?\{(.+)\}/) {
            if ($opt_d) { print "found section \"$1\" - at word $sum, line $line\n"; }
            # the first section of a chapter takes slot 0, later ones the next slot
            if ($depth == 0) { $index[1] = 0; } else { $index[1]++; }
            $depth = 1;
            $sNames[$index[0]][$index[1]] = $1;
            @index[2, 3] = (0, 0);
        }
        if (/^\\subsection\*?\{(.+)\}/) {
            if ($opt_d) { print "found subsection \"$1\" - at word $sum, line $line\n"; }
            # also advance when the previous heading was a subsubsection
            # (depth 3); otherwise this subsection would overwrite slot 0
            if ($depth >= 2) { $index[2]++; } else { $index[2] = 0; }
            $ssNames[$index[0]][$index[1]][$index[2]] = $1;
            $depth = 2;
            $index[3] = 0;
        }
        if (/^\\subsubsection\*?\{(.+)\}/) {
            if ($opt_d) { print "found subsubsection \"$1\" - at word $sum, line $line\n"; }
            # the first subsubsection restarts its own level; the subsection
            # slot must stay where it is
            if ($depth == 3) { $index[3]++; } else { $index[3] = 0; }
            $sssNames[$index[0]][$index[1]][$index[2]][$index[3]] = $1;
            $depth = 3;
        }
        if ($state == 1) {
            s/\\%//g; # escape \%
            s/%.+//g; # rid comments
            s/\{/\{ /g;     # put a space after every opening brace ...
            s/\}/ \}/g;     # ... and before every closing one, so they split off
            my $ignore = 0; # 1 while inside a \footnote on this line
            my $textit = 0; # 1 while inside a \textit nested in that footnote
            foreach my $w (split (/ +/, $_)) { # split into words
                if ($w =~ /\}/) {
                    # a closing brace ends the nested \textit when one is
                    # open, otherwise it ends the footnote
                    if ($textit == 1) { $textit = 0; }
                    else { $ignore = 0; }
                }
                if ($w =~ /\\footnote/) { $ignore = 1; }
                if ($w =~ /\\textit\{/ && $ignore == 1) { $textit = 1; }
                if ($w =~ /^\{\\/) { next; } # rule out environment starts
                if ($w =~ /^\\/) { next; }   # commands are not words
                if ($w !~ /\w/) { next; }    # neither is bare punctuation
                # NOTE(review): footnote words are left out of $sum only; the
                # per-level totals below still include them - confirm intent.
                $cSum[$index[0]]++;
                if ($depth >= 1) {
                    $sSum[$index[0]][$index[1]]++;
                }
                if ($depth >= 2) {
                    $ssSum[$index[0]][$index[1]][$index[2]]++;
                }
                if ($depth >= 3) {
                    $sssSum[$index[0]][$index[1]][$index[2]][$index[3]]++;
                }
                if ($ignore != 1) {
                    $sum++;
                }
            }
        }
        # These checks run after the counting step, so the line that switches
        # counting on is itself not counted.
        if (/^\\end\{figure\*?\}/) {
            if ($opt_d) { print "found end figure - resuming at line $line\n"; }
            $state = 1;
        }
        if (/^\\end\{table\*?\}/) {
            if ($opt_d) { print "found end table - resuming at line $line\n"; }
            $state = 1;
        }
        if (/^\\begin\{document\}/) {
            if ($opt_d) { print "found start document - starting count at line $line\n"; }
            @index = (0,0,0,0);
            $depth = 0;
            $state = 1;
        }
    }
}
close ($fh);
# prepare output: one line for the whole file, then one line per chapter,
# section, subsection and subsubsection, each level one tab deeper
my $buf = "";
$buf .= "$sum\twhole - $filename\n";
# chapter lines are shown only if any chapter mark was found; the deeper
# levels are then pushed one tab to the right
my $hasChapters = ($#cSum > 0);
my $tab = $hasChapters ? "\t" : "";
for my $i (0 .. $#cSum) {
    # A slot without words (for instance nothing written before the first
    # chapter) is left out, but the slots after it are still reported.
    next unless defined $cSum[$i];
    if ($hasChapters) {
        if ($i == 0) {
            $buf .= "\t$cSum[$i]\t(Before Chapter 1)\n";
        } else {
            my $cName = defined $cNames[$i] ? $cNames[$i] : "";
            $buf .= "\t$cSum[$i]\tChapter " . ($i) . " - $cName \n";
        }
    }
    my $secCounts = $sSum[$i] || [];
    for my $j (0 .. $#{$secCounts}) {
        next unless defined $secCounts->[$j];
        my $sName = defined $sNames[$i][$j] ? $sNames[$i][$j] : "";
        $buf .= "$tab\t$secCounts->[$j]\tSection " . ($j+1) . " - $sName \n";
        my $subCounts = $ssSum[$i][$j] || [];
        for my $k (0 .. $#{$subCounts}) {
            next unless defined $subCounts->[$k];
            my $ssName = defined $ssNames[$i][$j][$k] ? $ssNames[$i][$j][$k] : "";
            $buf .= "$tab\t\t$subCounts->[$k]\tSubsection " . ($k+1) . " - $ssName \n";
            my $subsubCounts = $sssSum[$i][$j][$k] || [];
            for my $ell (0 .. $#{$subsubCounts}) {
                next unless defined $subsubCounts->[$ell];
                my $sssName = defined $sssNames[$i][$j][$k][$ell] ? $sssNames[$i][$j][$k][$ell] : "";
                # numbered by its own position and indented one level below
                # its subsection
                $buf .= "$tab\t\t\t$subsubCounts->[$ell]\tSubsubsection " . ($ell+1) . " - $sssName \n";
            }
        }
    }
}
print $buf;
# -u: rewrite the file, replacing everything from the "% texWordCount - begin"
# line through the "% texWordCount - end" line with a fresh, commented copy
# of the report.  Nothing happens unless both marker lines were seen.
# NOTE(review): @lines holds the document after \input/\include expansion,
# so the contents of included files are written into the rewritten file.
if (defined $twcBegin &&
    defined $twcEnd &&
    $opt_u) {
    # checked before the file is opened for writing, so it is left untouched
    if ($twcEnd < $twcBegin) {
        die "# $progname fatal\t\tword count markers are out of order in \"$filename\"\n";
    }
    # untaint $filename variable
    if ($filename =~ /^([-\@\w.]+)$/) {
        $filename = $1; # $filename now untainted
    } else {
        die "# $progname fatal\t\tTainted data in \"$filename\""; # log this somewhere
    }
    open (OF, ">", $filename) || die "# $progname fatal\t\tcan't rewrite word count section\n";
    # everything in front of the begin marker (an empty slice when the marker
    # is the first line)
    print OF @lines[0 .. $twcBegin - 1];
    print OF "% texWordCount - begin\n";
    print OF "% generated on " . localtime(time()) . "\n";
    print OF "% command was \"$cmdLine\"\n";
    foreach my $reportLine (split (/\n/, $buf)) {
        print OF "% ", $reportLine, "\n";
    }
    print OF "% texWordCount - end\n";
    # everything behind the end marker, up to and including the last line
    print OF @lines[$twcEnd + 1 .. $#lines];
    # buffered write errors only show up here
    close (OF) || die "# $progname fatal\t\tcan't finish writing \"$filename\"\n";
}
# More files on the command line: add this file's total to the running sum,
# drop everything gathered for it and start over with the next name.
if ($filename = shift) {
    $many = 1;
    $manySum += $sum;
    undef @cSum;
    undef @sSum;
    undef @ssSum;
    undef @sssSum;
    undef @cNames;
    undef @sNames;
    undef @ssNames;
    undef @sssNames;
    undef @lines;
    # the marker positions belong to the file just finished; left in place
    # they would let -u rewrite the next file at the wrong lines
    undef $twcBegin;
    undef $twcEnd;
    goto NEWFILE;
}
# after the last of several files: add its total and print the grand total
if ($many == 1) {
    $manySum += $sum;
    print "$manySum\tall\n";
}
###
### END of main program
###
# From Sam Tygier (10 May 2007)
# Expands \input{...} and \include{...} lines.
#
# Takes the lines of a document and returns them with every line that starts
# with \input{name} or \include{name} replaced by the lines of the named
# file; lines of that file are expanded the same way.  All other lines are
# passed through unchanged.  The file is looked up as "name.tex" first and
# as "name" itself when only that exists.  Dies when the file cannot be
# opened.
# NOTE(review): there is no guard against a file that includes itself.
sub include_files {
    my @whole_doc = ();
    foreach my $doc_line (@_) {
        if ($doc_line =~ /^\\(?:input|include)\{([^}]*)\}/) {
            my $target = $1;
            my $path = $target . ".tex";
            # \input{chapter.tex} names the file with its extension
            if (!(-e $path) && -e $target) { $path = $target; }
            open (my $input, "<", $path) || die "# $progname crash\t\tCan't open \"$path\"";
            my @sub_doc = <$input>;
            close ($input);
            push(@whole_doc, include_files(@sub_doc));
        } else {
            push(@whole_doc, $doc_line);
        }
    }
    return @whole_doc;
}
__END__
Leave a Reply
You must be logged in to post a comment.