All Downloads are FREE. Search and download functionalities are using the official Maven repository.

docbook.fo.pdf2index Maven / Gradle / Ivy

There is a newer version: 2.4
Show newest version
#!/usr/bin/perl -- # -*- Perl -*-

# this needs some cleanup...

# Name of the external converter program whose text output is read
# through a pipe.
my $PSTOTEXT = "pstotext";

# First command-line argument: the file handed to $PSTOTEXT.
my $pdf = shift @ARGV;

# $index collects the index text read from the converter output;
# $inindex is the "currently inside the index" flag used by the reader.
my $index = "";
my $inindex = 0;

# Use the list form of a piped open so the file name is passed to the
# converter as a single argument and is never interpreted by a shell
# (spaces or metacharacters in the name are therefore harmless).  When no
# file name was given the converter is started without arguments, as
# before.  The command is passed as an explicit scalar followed by the
# argument list because open() evaluates its third argument in scalar
# context.
my @pstotext_args = defined $pdf ? ($pdf) : ();
open (F, "-|", $PSTOTEXT, @pstotext_args)
    or die "Cannot run $PSTOTEXT: $!\n";
while () {
    if (/^<\/index/) {
	$index .= $_;
	$inindex = 0;
    }
    $inindex = 1 if /^.*?<\/phrase>\s*)+)/s) {
    $cindex .= $1;
    $_ = $2;
    $index = $'; # '

    my @pages = m/.*?<\/phrase>\s*/sg;

    # Expand ranges
    if ($#pages >= 0) {
	my @mpages = ();
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
		for (my $count = $1; $count <= $2; $count++) {
		    push (@mpages, "$count");
		}
	    } else {
		push (@mpages, $page);
	    }
	}
	@pages = sort rangesort @mpages;
    }

    # Remove duplicates...
    if ($#pages > 0) {
	my @mpages = ();
	my $current = "";
	foreach my $page (@pages) {
	    my $pageno = &pageno($page);
	    if ($pageno ne $current) {
		push (@mpages, $page);
		$current = $pageno;
	    }
	}
	@pages = @mpages;
    }

    # Collapse ranges...
    if ($#pages > 1) {
	my @cpages = ();
	while (@pages) {
	    my $count = 0;
	    my $len = &rangelen($count, @pages);
	    if ($len <= 2) {
		my $page = shift @pages;
		push (@cpages, $page);
	    } else {
		my $fpage = shift @pages;
		my $lpage = "";
		while ($len > 1) {
		    $lpage = shift @pages;
		    $len--;
		}
		my $fpno = &pageno($fpage);
		my $lpno = &pageno($lpage);
		$fpage =~ s/>$fpno${fpno}-$lpno//;
    $page =~ s/^//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

# Convert a roman-numeral page label (e.g. "xiv") to its integer value.
# Returns nothing (undef in scalar context) when the label does not consist
# solely of roman digits.
sub _roman_value {
    my ($label) = @_;
    return unless defined $label && $label =~ /\A[ivxlcdm]+\z/i;

    my %digit = (i => 1, v => 5, x => 10, l => 50, c => 100, d => 500, m => 1000);
    my $total = 0;
    my $prev = 0;
    # Walk right to left: a digit smaller than its right neighbour is
    # subtractive (the "i" in "iv"), anything else is additive.
    foreach my $char (reverse split //, lc $label) {
        my $value = $digit{$char};
        if ($value < $prev) {
            $total -= $value;
        } else {
            $total += $value;
        }
        $prev = $value;
    }
    return $total;
}

# Comparison routine for sort(); compares the page labels of $a and $b as
# extracted by pageno().
#
# Ordering:
#   1. labels that do not start with a digit (roman pages and anything
#      else) come before arabic page numbers;
#   2. arabic page numbers are compared numerically;
#   3. among non-arabic labels, labels that are not roman numerals come
#      first (compared case-insensitively as strings), followed by roman
#      numerals ordered by their numeric value.
#
# Previously two non-arabic labels were compared with <=>, which treats
# both as 0, so roman pages were never ordered and identical roman pages
# did not necessarily end up next to each other.
sub rangesort {
    my $apno = pageno($a);
    my $bpno = pageno($b);

    my $a_arabic = ($apno =~ /^\d+/) ? 1 : 0;
    my $b_arabic = ($bpno =~ /^\d+/) ? 1 : 0;

    # Make sure roman pages come before arabic ones
    return -1 if (!$a_arabic && $b_arabic);
    return  1 if ($a_arabic && !$b_arabic);

    return $apno <=> $bpno if $a_arabic;

    # Both labels are non-arabic from here on.
    my $aval = _roman_value($apno);
    my $bval = _roman_value($bpno);

    return $aval <=> $bval if (defined $aval && defined $bval);

    # Keep the order total: unparsable labels sort ahead of roman numerals.
    return -1 if (!defined $aval && defined $bval);
    return  1 if (defined $aval && !defined $bval);
    return lc($apno) cmp lc($bpno);
}

# Measure the run of consecutive page numbers that begins at a given
# position in a list of page entries.
#
# Arguments:
#   $count - index of the entry at which the run starts
#   @pages - page entries; each one is reduced to its label with pageno()
#
# Returns the number of entries in the run, counting the starting entry
# itself (so the result is at least 1).
#
# Two neighbouring entries belong to the same run only when both labels
# consist purely of digits and the second is exactly one greater than the
# first.  Without the digit check a non-numeric label (for example a roman
# page "xii") is treated as 0 by the addition, so "xii" followed by "1"
# was counted as consecutive and the two were collapsed into one range.
sub rangelen {
    my $count = shift;
    my @pages = @_;
    my $len = 1;

    my $current = pageno($pages[$count]);
    while ($count < $#pages) {
        $count++;
        my $next = pageno($pages[$count]);

        # Stop at the first entry that does not continue the sequence.
        last unless ($current =~ /\A\d+\z/
                     && $next =~ /\A\d+\z/
                     && $current + 1 == $next);

        $current = $next;
        $len++;
    }

    return $len;
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy