#!/usr/bin/perl -- # -*- Perl -*- # this needs some cleanup... my $PSTOTEXT = "pstotext"; my $pdf = shift @ARGV; my $index = ""; my $inindex = 0; open (F, "$PSTOTEXT $pdf |"); while (<F>) { if (/^<\/index/) { $index .= $_; $inindex = 0; } $inindex = 1 if /^<index/; if ($inindex) { $index .= $_ if /^\s*</; } } my $cindex = ""; while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) { $cindex .= $1; $_ = $2; $index = $'; # ' my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg; # Expand ranges if ($#pages >= 0) { my @mpages = (); foreach my $page (@pages) { my $pageno = &pageno($page); if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky - for (my $count = $1; $count <= $2; $count++) { push (@mpages, "<phrase role=\"$pageno\">$count</phrase>"); } } else { push (@mpages, $page); } } @pages = sort rangesort @mpages; } # Remove duplicates... if ($#pages > 0) { my @mpages = (); my $current = ""; foreach my $page (@pages) { my $pageno = &pageno($page); if ($pageno ne $current) { push (@mpages, $page); $current = $pageno; } } @pages = @mpages; } # Collapse ranges... if ($#pages > 1) { my @cpages = (); while (@pages) { my $count = 0; my $len = &rangelen($count, @pages); if ($len <= 2) { my $page = shift @pages; push (@cpages, $page); } else { my $fpage = shift @pages; my $lpage = ""; while ($len > 1) { $lpage = shift @pages; $len--; } my $fpno = &pageno($fpage); my $lpno = &pageno($lpage); $fpage =~ s/>$fpno</>${fpno}-$lpno</s; push (@cpages, $fpage); } } @pages = @cpages; } my $page = shift @pages; $page =~ s/\s*$//s; $cindex .= $page; while (@pages) { $page = shift @pages; $page =~ s/\s*$//s; $cindex .= ", $page"; } } $cindex .= $index; print "$cindex\n"; sub pageno { my $page = shift; $page =~ s/^<phrase.*?>//; $page =~ s/^<link.*?>//; return $1 if $page =~ /^([^<>]+)/; return "?"; } sub rangesort { my $apno = &pageno($a); my $bpno = &pageno($b); # Make sure roman pages come before arabic ones, otherwise sort them in order return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/); return 1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/); return $apno <=> $bpno; } sub rangelen { my $count = shift; my @pages = @_; my $len = 1; my $inrange = 1; my $current = &pageno($pages[$count]); while ($count < $#pages && $inrange) { $count++; my $next = &pageno($pages[$count]); if ($current + 1 eq $next) { $current = $next; $inrange = 1; $len++; } else { $inrange = 0; } } return $len; }