docbook.fo.pdf2index Maven / Gradle / Ivy
#!/usr/bin/perl -- # -*- Perl -*-
# this needs some cleanup...
# Main script: run an external text extractor on a PDF, pick up index text
# from its output, and normalise each run of page references in that text
# (expand ranges, sort, drop duplicates, collapse consecutive pages).
#
# NOTE(review): this copy of the script has lost text in several places,
# apparently wherever the source contained something that looked like markup
# between angle brackets. The damaged spots are flagged below. The code does
# not parse as it stands - restore the missing pieces from the upstream
# script before relying on it.

# Name of the external command that is run on the PDF.
my $PSTOTEXT = "pstotext";
# First command-line argument, handed to the command above.
my $pdf = shift @ARGV;
# $index accumulates text read from the command; $inindex is a flag that the
# loop below sets and clears.
my $index = "";
my $inindex = 0;
# Read the output of the command through a pipe on filehandle F.
# NOTE(review): the result of open is not checked, and $pdf is interpolated
# unquoted into a shell command line.
open (F, "$PSTOTEXT $pdf |");
# NOTE(review): the loop condition is empty in this copy; presumably it read
# lines from F - confirm against upstream.
while () {
# A line that begins with the closing index tag is appended to $index and
# clears the $inindex flag.
if (/^<\/index/) {
$index .= $_;
$inindex = 0;
}
# NOTE(review): the next line is damaged. It looks like several statements
# fused together: the start of a conditional that sets $inindex, and the end
# of a loop header that matches against a pattern with at least two capture
# groups. The parentheses are unbalanced and $cindex is not declared anywhere
# in the visible text.
$inindex = 1 if /^.*?<\/phrase>\s*)+)/s) {
# Capture 1 is appended to $cindex, capture 2 becomes $_ so that the global
# match below can scan it, and $index is replaced by the text that followed
# the match (the post-match variable).
$cindex .= $1;
$_ = $2;
$index = $'; # '
# Collect every piece of $_ that ends in a closing phrase tag plus optional
# whitespace. NOTE(review): the start of this pattern also looks truncated.
my @pages = m/.*?<\/phrase>\s*/sg;
# Expand ranges
if ($#pages >= 0) {
my @mpages = ();
foreach my $page (@pages) {
my $pageno = &pageno($page);
# A page number made of two digit runs around a single non-digit character
# is treated as a range: one entry per page from the first number to the
# second is pushed instead of the original entry.
if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
for (my $count = $1; $count <= $2; $count++) {
push (@mpages, "$count ");
}
} else {
push (@mpages, $page);
}
}
# Order the expanded entries with the rangesort comparator.
@pages = sort rangesort @mpages;
}
# Remove duplicates...
# Only runs when there are at least two entries. Each entry is compared with
# the page number last kept, so only adjacent duplicates are dropped; this
# relies on the sort above.
if ($#pages > 0) {
my @mpages = ();
my $current = "";
foreach my $page (@pages) {
my $pageno = &pageno($page);
if ($pageno ne $current) {
push (@mpages, $page);
$current = $pageno;
}
}
@pages = @mpages;
}
# Collapse ranges...
# Only runs when there are at least three entries.
if ($#pages > 1) {
my @cpages = ();
while (@pages) {
my $count = 0;
# Length of the run of consecutive pages at the front of @pages.
my $len = &rangelen($count, @pages);
# A run of one or two pages is not collapsed: move just the first entry.
if ($len <= 2) {
my $page = shift @pages;
push (@cpages, $page);
} else {
# Take the first entry of the run, then shift off the remaining entries of
# the run; $lpage ends up holding the last one.
my $fpage = shift @pages;
my $lpage = "";
while ($len > 1) {
$lpage = shift @pages;
$len--;
}
my $fpno = &pageno($fpage);
my $lpno = &pageno($lpage);
# NOTE(review): damaged from here on. The substitution below presumably
# rewrote the first page number as first-last, but its pattern is mangled.
# The rest of the main loop and the opening of the pageno subroutine are
# missing; the remaining lines look like the tail of pageno, which returns
# the leading run of characters other than angle brackets in $page, or a
# question mark when there is none.
$fpage =~ s/>$fpno>${fpno}-$lpno//;
$page =~ s/^//;
return $1 if $page =~ /^([^<>]+)/;
return "?";
}
# Sort comparator for page entries, used as "sort rangesort LIST". It reads
# the package variables $a and $b that sort sets for each comparison.
#
# Ordering:
#   * an entry whose page number does not start with a digit (roman-numbered
#     front matter) comes before one whose page number does;
#   * two arabic page numbers compare numerically;
#   * two non-arabic page numbers compare by roman-numeral value. Comparing
#     them with <=> directly would numify both to 0 and report every pair as
#     equal, leaving them unsorted and their duplicates non-adjacent.
#
# Returns a negative, zero or positive value as sort expects.
sub rangesort {
    my $apno = pageno($a);
    my $bpno = pageno($b);

    my $a_is_arabic = ($apno =~ /^\d+/) ? 1 : 0;
    my $b_is_arabic = ($bpno =~ /^\d+/) ? 1 : 0;

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if !$a_is_arabic && $b_is_arabic;
    return 1  if $a_is_arabic  && !$b_is_arabic;

    return $apno <=> $bpno if $a_is_arabic;
    return _roman_to_int($apno) <=> _roman_to_int($bpno);
}

# Convert a roman numeral to its integer value. Case and surrounding
# whitespace are ignored. Text that is not made up solely of roman digits
# yields 0, so such entries compare equal to each other and sort ahead of
# genuine numerals.
sub _roman_to_int {
    my ($text) = @_;
    return 0 unless defined $text;

    my %value_of = (
        i => 1,   v => 5,   x => 10,  l => 50,
        c => 100, d => 500, m => 1000,
    );

    my $roman = lc $text;
    $roman =~ s/^\s+//;
    $roman =~ s/\s+$//;
    return 0 if $roman !~ /^[ivxlcdm]+$/;

    # Walk right to left: a digit smaller than the one to its right is
    # subtractive (the i in iv), otherwise it is added.
    my $total    = 0;
    my $previous = 0;
    foreach my $digit (reverse split //, $roman) {
        my $value = $value_of{$digit};
        if ($value < $previous) {
            $total -= $value;
        } else {
            $total += $value;
        }
        $previous = $value;
    }
    return $total;
}
# Return the length of the run of consecutive page numbers that starts at
# index $count in the given list of page entries.
#
# Arguments:
#   $count - index of the entry at which the run starts
#   @pages - page entries; each is passed through pageno() to get its number
#
# Returns the number of entries in the run (1 when the next entry is not the
# following page), or 0 when there is no entry at index $count.
#
# Two entries count as consecutive only when both page numbers are plain
# decimal integers and the second is exactly one more than the first. Without
# the integer check a non-numeric page such as "xii" numifies to 0, so
# "xii" followed by "1" would be taken for a run.
sub rangelen {
    my $count = shift;
    my @pages = @_;

    # Nothing at the requested position: no run at all.
    return 0 if $count > $#pages;

    my $len     = 1;
    my $current = pageno($pages[$count]);

    while ($count < $#pages) {
        $count++;
        my $next = pageno($pages[$count]);

        last unless $current =~ /\A\d+\z/
                 && $next    =~ /\A\d+\z/
                 && $current + 1 == $next;

        $current = $next;
        $len++;
    }

    return $len;
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy