Subversion Repositories svn LFS-FR

Rev

Blame | Last modification | View Log | RSS feed

#!/usr/bin/perl -- # -*- Perl -*-

# this needs some cleanup...

my $PSTOTEXT = "pstotext";

my $pdf = shift @ARGV;

my $index = "";
my $inindex = 0;
open (F, "$PSTOTEXT $pdf |");
while (<F>) {
    if (/^<\/index/) {
        $index .= $_;
        $inindex = 0;
    }
    $inindex = 1 if /^<index/;

    if ($inindex) {
        $index .= $_ if /^\s*</;
    }
}

my $cindex = "";
while ($index =~ /^(.*?)((<phrase role=\"pageno\">.*?<\/phrase>\s*)+)/s) {
    $cindex .= $1;
    $_ = $2;
    $index = $'; # '

    my @pages = m/<phrase role=\"pageno\">.*?<\/phrase>\s*/sg;

    # Expand ranges
    if ($#pages >= 0) {
        my @mpages = ();
        foreach my $page (@pages) {
            my $pageno = &pageno($page);
            if ($pageno =~ /^([0-9]+)[^0-9]([0-9]+)$/) { # funky -
                for (my $count = $1; $count <= $2; $count++) {
                    push (@mpages, "<phrase role=\"$pageno\">$count</phrase>");
                }
            } else {
                push (@mpages, $page);
            }
        }
        @pages = sort rangesort @mpages;
    }

    # Remove duplicates...
    if ($#pages > 0) {
        my @mpages = ();
        my $current = "";
        foreach my $page (@pages) {
            my $pageno = &pageno($page);
            if ($pageno ne $current) {
                push (@mpages, $page);
                $current = $pageno;
            }
        }
        @pages = @mpages;
    }

    # Collapse ranges...
    if ($#pages > 1) {
        my @cpages = ();
        while (@pages) {
            my $count = 0;
            my $len = &rangelen($count, @pages);
            if ($len <= 2) {
                my $page = shift @pages;
                push (@cpages, $page);
            } else {
                my $fpage = shift @pages;
                my $lpage = "";
                while ($len > 1) {
                    $lpage = shift @pages;
                    $len--;
                }
                my $fpno = &pageno($fpage);
                my $lpno = &pageno($lpage);
                $fpage =~ s/>$fpno</>${fpno}-$lpno</s;
                push (@cpages, $fpage);
            }
        }
        @pages = @cpages;
    }

    my $page = shift @pages;
    $page =~ s/\s*$//s;
    $cindex .= $page;
    while (@pages) {
        $page = shift @pages;
        $page =~ s/\s*$//s;
        $cindex .= ", $page";
    }
}
$cindex .= $index;

print "$cindex\n";

sub pageno {
    my $page = shift;

    $page =~ s/^<phrase.*?>//;
    $page =~ s/^<link.*?>//;

    return $1 if $page =~ /^([^<>]+)/;
    return "?";
}

sub rangesort {
    my $apno = &pageno($a);
    my $bpno = &pageno($b);

    # Make sure roman pages come before arabic ones, otherwise sort them in order
    return -1 if ($apno !~ /^\d+/ && $bpno =~ /^\d+/);
    return  1 if ($apno =~ /^\d+/ && $bpno !~ /^\d+/);
    return $apno <=> $bpno;
}

sub rangelen {
    my $count = shift;
    my @pages = @_;
    my $len = 1;
    my $inrange = 1;

    my $current = &pageno($pages[$count]);
    while ($count < $#pages && $inrange) {
        $count++;
        my $next = &pageno($pages[$count]);
        if ($current + 1 eq $next) {
            $current = $next;
            $inrange = 1;
            $len++;
        } else {
            $inrange = 0;
        }
    }

    return $len;
}