en_ulb_tagged/Check_ULB.pl

144 lines
3.4 KiB
Perl
Executable File

# Checks ULB in ULB.xml against tagged ULB
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
open(LOG, ">Logs/Log.txt") or die "$!";
open(OUT1, ">Output/Standard.txt") or die "$!";
open(OUT2, ">Output/Tagged.txt") or die "$!";
my ($ULBxml, $taggedULBDir) = ("/Users/dillardfam/Documents/WA/WACS/fork/ULB_xml/ULB.xml", "/Users/dillardfam/Documents/WA/WACS/fork/Manual_Tagging");
my (@filesToRun) = ();
my %fullName;
my $filePattern = "\55-1TI.xml" ;
my $file;
my $xmlText = read_file("$ULBxml", binmode => 'utf8');
GetBooksToCheck();
Compare();
sub Compare {
foreach my $file (@filesToRun) {
say LOG "|$file|, |$taggedULBDir/$file.xml|";
my $taggedText = read_file("$taggedULBDir/$file.xml", binmode => 'utf8');
GetGist($file, $taggedText);
}
}
sub GetGist {
my ($fileName, $wholeTaggedText) = @_;
my ($verseRef, $standard, $tagged);
say LOG "|$fileName|, |$fullName{$fileName}|";
# while ($wholeTaggedText =~ /<verse name="($fullName{$fileName} \d+:\d+)">((.|\n)*?)<preText>(.*?)<\/preText>((.|\n)*?)\n\t+((<w ((.|\n)*?)\n)*)\t+<\/verse>/sg) {
while ($wholeTaggedText =~ /<verse name="($fullName{$fileName} \d+:\d+)">((.|\n)*?)<preText>(.*?)<\/preText>((.|\n)*?)\n\t+((<(w|usfm)(>| )((.|\n)*?)\n)*)\t+<\/verse>/sg) {
my ($preText, $gist) = ($4, $7);
$gist =~ s/<comment>.*?<\/comment>//sg;
$verseRef = $1;
if ($xmlText =~ /<verse name="$verseRef">\n\t+<preText>([^\n]*)<\/preText>\n\t+<text>([^\n]*)<\/text>\n\t+<\/verse>/s) {
my ($standardPT, $standardT) = ($1, $2);
($tagged) = Untag($preText, $gist);
#say LOG $tagged;
$standard = $standardPT . " " . $standardT;
$standard =~ s/<[^<>]*>//g;
$standard =~ s/ {2,}/ /g;
$standard =~ s/ +$//;
if ($standard ne $tagged) {
say LOG "\n$verseRef\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n";
say OUT1 $standard;
say OUT2 $tagged;
}
}
}
}
sub Untag {
my ($pre, $txt) = ($_[0], $_[1]);
say LOG "<00>\t\$pre: $pre\n\$txt: $txt";
if ($txt =~ /\[\d\]/) {
$txt = Reorder($txt);
say LOG "<0>\t$txt";
}
while ($txt =~ s/<phrase>\n(\t*<w [^>]*>[^<]*<\/w>\n)*\t*<phraseWords>([^<]*)<\/phraseWords>\n\t*<\/phrase>/<w>$2<\/w>/sg) {
my $phraseWords = $2;
#$txt = Phrase($txt);
say LOG "<0a>\$phraseWords: $phraseWords"
}
$txt =~ s/[\t\n]/ /g;
$txt =~ s/(√|<[^<>]*>)//g;
$txt = $pre . " " . $txt;
$txt =~ s/ {2,}/ /g;
$txt =~ s/^(.+[^ ])(\\)/$1 $2/g;
$txt =~ s/ +$//;
$txt =~ s/— +/—/g;
# say LOG "<1>\t$txt";
return $txt;
}
sub Reorder {
my $txt = $_[0];
say LOG "<R1>\t$txt";
while ($txt =~ s/sub="\[(\d+)\]" ?([^>]*>)([^<]*)(<(.|\n)*?>[^<]*)\[\1\]([^<]*<)/$2√$4$3$6/s) {}
say LOG "<R2>\t$txt";
return $txt
}
sub GetBooksToCheck {
while (<DATA>) {
chomp;
unless (/^#/) {
if (/([^\t]*)\t([^\t]*)/) {
my ($file, $book) = ($1, $2);
say "|$file|";
push @filesToRun, "$file";
$fullName{$file} = $book;
}
}
}
}
close OUT1; close OUT2;
close LOG;
say "\nDone.";
# =====
__DATA__
#41-MAT Matthew
#42-MRK Mark
#43-LUK Luke
#44-JHN John
#45-ACT Acts
#46-ROM Romans
#47-1CO 1 Corinthians
#48-2CO 2 Corinthians
#49-GAL Galatians
#50-EPH Ephesians
#51-PHP Philippians
#52-COL Colossians
#53-1TH 1 Thessalonians
#54-2TH 2 Thessalonians
#55-1TI 1 Timothy
#56-2TI 2 Timothy
#57-TIT Titus
#58-PHM Philemon
#59-HEB Hebrews
#60-JAS James
#61-1PE 1 Peter
#62-2PE 2 Peter
#63-1JN 1 John
#64-2JN 2 John
65-3JN 3 John
#66-JUD Jude
#67-REV Revelation