en_ulb_tagged/Build_ULB_XML_for_Tagging.pl

127 lines
3.7 KiB
Perl
Executable File

# Builds a workable ULB.xml file from the USFM source files, keeping every USFM marker in place.
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Driver: collect every USFM file under $topDir, convert them all with
# ReadFiles(), and wrap the result in a single <xml> root inside ULB.xml.

# Source tree holding the .usfm files, and the directory that receives ULB.xml.
my ($topDir, $outDir) = ("/Users/dillardfam/Documents/WA/WACS/en_ulb", "/Users/dillardfam/Documents/WA/WACS/fork/ULB_xml");
my $logPath = "Logs/Log.txt";    # relative to the current working directory
my $outPath = "$outDir/ULB.xml";

# LOG and OUT remain bareword handles because ReadFiles and TagInternalUSFM
# write to them by name. The three-argument form keeps the mode separate
# from the path, and the messages say which file could not be opened.
open(LOG, '>', $logPath) or die "Cannot open $logPath for writing: $!";
open(OUT, '>', $outPath) or die "Cannot open $outPath for writing: $!";
say OUT "<xml>";

my @filesToRun = ();
my $filePattern = '\.usfm' ;
#my $filePattern = '67-REV\.usfm' ;
my $file;

# Keep plain files only: a directory whose name happens to end in .usfm
# cannot be slurped later on.
find( sub { push @filesToRun, $File::Find::name if ( -f $_ && m/^(.*)$filePattern$/ ) }, $topDir) ;
@filesToRun = sort @filesToRun;

ReadFiles();

say OUT "</xml>";
# Buffered write errors only surface at close, so check both handles.
close(OUT) or die "Cannot close $outPath: $!";
close(LOG) or die "Cannot close $logPath: $!";
say "\nDone.";
# =====
# Converts every file listed in the file-level @filesToRun into one <book>
# element and writes it to the OUT handle.
#
# Takes no arguments. For each file it prints the path to STDOUT, reshapes
# the USFM text so that each verse sits on its own line, wraps chapters and
# verses in XML elements, passes each <text>...</text> span through
# TagInternalUSFM(), and logs intermediate results to the LOG handle.
sub ReadFiles {
    foreach my $path ( @filesToRun ) {
        say $path;
        my $fileText = read_file($path, binmode => 'utf8');
        $fileText =~ s/[ \n]+$//;
        say LOG "|$fileText|";

        my ($book, $chap, $vers, $chapStart);

        # The \h line supplies the book name used in every name="..." attribute.
        if ($fileText =~ /\\h ([^\n]*)/) {
            $book = $1;
        }

        # Flatten the file to a single line, squeezing the runs of spaces that
        # joining the lines leaves behind.
        $fileText =~ s/\n/ /g;
        $fileText =~ s/ {2,}/ /g;

        # Re-break the text: every \s5 starts a new line, and so does every \v
        # that follows another \v on the same line. The lookahead does this
        # without a stand-in character, so no character of the text itself
        # can be mistaken for a verse marker.
        $fileText =~ s/(\\s5)/\n$1/g;
        $fileText =~ s/(\\v(?:(?!\\v)[^\n])*)(?=\\v)/$1\n/g;

        # Everything from \id up to the first line break becomes the heading.
        $fileText =~ s/(\\id[^\n]*)\n/\t\t<heading>$1<\/heading>\n/;
        $fileText =~ s/ +\n/\n/g;
        # A marker left dangling at the end of a line is moved to the start
        # of the following line.
        $fileText =~ s/(\\(q\d?|pi?|m|n?b))\n/\n$1 /g;

        # split already removed the newlines, so the lines need no chomp.
        my @lines = split /\n/, $fileText;
        my $bookXml = "";
        foreach my $line (@lines) {
            if ($line =~ /<book name="(.*?)">/) { $book = $1; }

            if ($line =~ /\\c (\d+).* \\v (\d+)/) {
                # First verse of a chapter: open the chapter, closing the
                # previous one if there was any.
                ($chap, $vers) = ($1, $2);
                $line = "\t\t<chapter name=\"$book $chap\">\n\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>";
                $line = "\t\t</chapter>\n$line" if $chapStart;
                $chapStart = 1;
            }
            elsif ($line =~ /\\v (\d+)/) {
                $vers = $1;
                $line = "\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>";
            }

            # Split the verse body: everything through the last "\v N " is
            # <preText>, the remainder is <text>.
            $line =~ s/(<verse[^>]*>)(.*\\v \d+ )(.*)(<\/verse>)/$1\n\t\t\t\t<preText>$2<\/preText>\n\t\t\t\t<text>$3<\/text>\n\t\t\t$4/s;

            if ($line =~ /<text>.*<\/text>/p) {
                say LOG "<-0>\t$line";
                # Copy the match pieces before TagInternalUSFM runs its own
                # regexes and overwrites the match variables.
                my ($pre, $match, $post) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
                $line = $pre . TagInternalUSFM($match) . $post;
            }

            say LOG "---\n<CC>\n$line\n===";
            $line =~ s# +</#</#g;
            $bookXml .= $line . "\n";
        }

        # Close the last chapter only if one was actually opened; a file
        # without any \c marker must not produce a stray </chapter>.
        my $chapterClose = $chapStart ? "\t\t</chapter>\n" : "";
        say OUT "\t<book name=\"$book\">\n$bookXml$chapterClose\t</book>";
    }
}
# Wraps the USFM markers found inside a "<text>...</text>" span in
# <usfm>...</usfm> tags.
#
# Argument: a string containing one <text>...</text> element.
# Returns:  that string with every "\f ...\f*" span, every "\qs ...\qs*" span
#           and each of the markers \b \m \p \pi \q \qN \s2 (when followed by
#           a space) wrapped as <usfm>...</usfm>, and with runs of spaces in
#           the surrounding text squeezed to a single space.
# Dies if a placeholder it inserted can no longer be found.
# Side effect: writes the result to the LOG handle.
sub TagInternalUSFM {
    my ($line, $placeNum) = ($_[0], 1);
    my %places;

    # Extraction order matters: the two span kinds are lifted out first so
    # that markers inside them are never seen by the single-marker pattern.
    # The single-marker pattern requires a following space, so a longer
    # marker that merely starts with one of these names (e.g. \bd) is left
    # alone.
    my @markerPatterns = (
        qr/\\f .*?\\f\*/,
        qr/\\qs .*?\\qs\*/,
        qr/\\(?:[bm]|pi?|q\d?|s2)(?= )/,
    );

    # Swap each marker for a numbered placeholder, right-most match first.
    # The substitution itself is the loop condition, so the pattern that is
    # tested is always the pattern that is replaced.
    foreach my $marker (@markerPatterns) {
        while ($line =~ s/(<text>.*)($marker)(.*<\/text>)/$1<place number="$placeNum"\/>$3/) {
            $places{$placeNum} = $2;
            $placeNum++;
        }
    }

    # Squeeze runs of spaces while the markers are out of the string, so the
    # spacing inside the extracted markers is preserved.
    $line =~ s/ {2,}/ /g;

    # Put every marker back, now wrapped in <usfm> tags.
    foreach my $place (1 .. $placeNum - 1) {
        $line =~ s/<place number="$place"\/>/<usfm>$places{$place}<\/usfm>/
            or die "TagInternalUSFM: placeholder $place not found in: $line";
    }

    say LOG "<+9>\t$line";
    return $line;
}