2020-07-28 21:34:00 +00:00
|
|
|
# Checks ULB in ULB.xml against tagged ULB
|
|
|
|
use 5.18.0;
|
|
|
|
use File::Slurp;
|
|
|
|
use File::Find ;
|
|
|
|
use Cwd ;
|
|
|
|
use utf8;
|
|
|
|
#use open IN => ":utf8", OUT => ":utf8";
|
|
|
|
use open IO => ":utf8";
|
|
|
|
|
|
|
|
# ---- Setup and main sequence ----------------------------------------------
# Report files. LOG, OUT1 and OUT2 are kept as package filehandles because the
# subs in this file print to them by name.
# Three-argument open keeps the mode separate from the path. No layer is given
# in the mode, so the default layer declared by the `use open` pragma in this
# file still applies. The die message names the file that could not be opened.
open(LOG,  '>', "Logs/Log.txt")        or die "Cannot open Logs/Log.txt for writing: $!";
open(OUT1, '>', "Output/Standard.txt") or die "Cannot open Output/Standard.txt for writing: $!";
open(OUT2, '>', "Output/Tagged.txt")   or die "Cannot open Output/Tagged.txt for writing: $!";

# Inputs: the standard ULB XML file and the directory of tagged book files.
my ($ULBxml, $taggedULBDir) = ("/Users/dillardfam/Documents/WA/WACS/fork/ULB_xml/ULB.xml", "/Users/dillardfam/Documents/WA/WACS/fork/Manual_Tagging");

# Filled by GetBooksToCheck(): file stems to process, and stem => book name.
my (@filesToRun) = ();
my %fullName;

# NOTE(review): $filePattern and $file are not read anywhere in this script
# (the subs declare their own $file). Also, "\55" in a double-quoted string is
# an octal escape for "-", so $filePattern holds "--1TI.xml" -- confirm intent
# before using it.
my $filePattern = "\55-1TI.xml" ;
my $file;

# Whole standard text, read once and searched per verse in GetGist().
my $xmlText = read_file($ULBxml, binmode => 'utf8');

GetBooksToCheck();
Compare();
|
|
|
|
|
|
|
|
# Compare: for every file stem queued in @filesToRun, log the stem and the
# path of its tagged XML file, read that file as one string, and hand both
# to GetGist() for the verse-by-verse comparison.
sub Compare {
    for my $stem (@filesToRun) {
        my $taggedPath = "$taggedULBDir/$stem.xml";

        say LOG "|$stem|, |$taggedPath|";

        # Scalar assignment on purpose: read_file returns the whole file as a
        # single string only in scalar context.
        my $taggedContents = read_file($taggedPath, binmode => 'utf8');

        GetGist($stem, $taggedContents);
    }
}
|
|
|
|
|
|
|
|
# GetGist($fileName, $wholeTaggedText)
#
# Walks every <verse name="BOOK c:v"> element of the tagged text whose book
# name is $fullName{$fileName}. For each one it takes the <preText> content
# and the trailing run of lines that start with <w or <usfm, removes
# <comment>...</comment> elements, flattens the result with Untag(), and
# compares it with the verse of the same name in $xmlText (the standard text).
#
# On a mismatch both versions are written to LOG, the standard line to OUT1
# and the tagged line to OUT2. A verse that has no counterpart in the standard
# text is reported to LOG. Nothing is returned.
sub GetGist {
    my ($fileName, $wholeTaggedText) = @_;
    my ($verseRef, $standard, $tagged);

    say LOG "|$fileName|, |$fullName{$fileName}|";

    # Captures used: $1 = verse reference, $4 = preText content,
    # $7 = the block of <w ...>/<usfm ...> lines just before </verse>.
    # \Q...\E makes the book name match literally.
    while ($wholeTaggedText =~ /<verse name="(\Q$fullName{$fileName}\E \d+:\d+)">((.|\n)*?)<preText>(.*?)<\/preText>((.|\n)*?)\n\t+((<(w|usfm)(>| )((.|\n)*?)\n)*)\t+<\/verse>/sg) {
        # Copy every capture before running another regex: a successful
        # s/// below resets $1, which previously left $verseRef undefined
        # for any verse containing a <comment>, so that verse was skipped.
        my ($preText, $gist);
        ($verseRef, $preText, $gist) = ($1, $4, $7);

        $gist =~ s/<comment>.*?<\/comment>//sg;

        if ($xmlText =~ /<verse name="\Q$verseRef\E">\n\t+<preText>([^\n]*)<\/preText>\n\t+<text>([^\n]*)<\/text>\n\t+<\/verse>/s) {
            my ($standardPT, $standardT) = ($1, $2);

            ($tagged) = Untag($preText, $gist);
            #say LOG $tagged;

            # Normalise the standard verse the same way Untag() normalises
            # the tagged one: drop markup, collapse spaces, trim the end.
            $standard = $standardPT . " " . $standardT;
            $standard =~ s/<[^<>]*>//g;
            $standard =~ s/ {2,}/ /g;
            $standard =~ s/ +$//;

            if ($standard ne $tagged) {
                say LOG "\n$verseRef\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n";
                say OUT1 $standard;
                say OUT2 $tagged;
            }
        }
        else {
            # Do not drop the verse silently: an unchecked verse would
            # otherwise look like a passing one.
            say LOG "\n$verseRef\nNOT FOUND in standard text\n";
        }
    }
}
|
|
|
|
|
|
|
|
# Untag($pre, $txt)
#
# Flattens the tagged markup of one verse into plain text.
#   $pre - the verse's preText string, prepended to the result
#   $txt - the verse's tagged word lines
# Returns the plain string: "$pre $txt" with markup removed, runs of spaces
# collapsed and trailing spaces trimmed. Progress is written to LOG.
sub Untag {
    my ($pre, $txt) = @_;

    say LOG "<00>\t\$pre: $pre\n\$txt: $txt";

    # Reorder() handles markers of the form [N] with any number of digits, so
    # the guard accepts multi-digit markers too (it used to test /\[\d\]/,
    # which never fired for a lone [10]).
    if ($txt =~ /\[\d+\]/) {
        $txt = Reorder($txt);
        say LOG "<0>\t$txt";
    }

    # Replace each <phrase> element by a single <w> holding its phraseWords.
    # The /g substitution rewrites every phrase in one pass, so the body runs
    # once and logs the phraseWords of the last phrase replaced.
    while ($txt =~ s/<phrase>\n(\t*<w [^>]*>[^<]*<\/w>\n)*\t*<phraseWords>([^<]*)<\/phraseWords>\n\t*<\/phrase>/<w>$2<\/w>/sg) {
        my $phraseWords = $2;
        #$txt = Phrase($txt);
        say LOG "<0a>\$phraseWords: $phraseWords"
    }

    $txt =~ s/[\t\n]/ /g;          # tabs and newlines become spaces
    $txt =~ s/(√|<[^<>]*>)//g;     # drop the placeholder sign and all tags
    $txt = $pre . " " . $txt;
    $txt =~ s/ {2,}/ /g;

    # Put a space before a backslash that directly follows a non-space
    # character. NOTE(review): the pattern is anchored at the start and .+ is
    # greedy, so only the last such backslash is changed -- confirm that one
    # per verse is enough.
    $txt =~ s/^(.+[^ ])(\\)/$1 $2/g;

    $txt =~ s/ +$//;
    $txt =~ s/— +/—/g;             # no spaces after an em dash
    # say LOG "<1>\t$txt";

    return $txt;
}
|
|
|
|
|
2020-08-11 21:50:50 +00:00
|
|
|
# Reorder($txt)
#
# Resolves sub="[N]" attributes. For each one, the attribute (and one
# following space) is removed from its tag, the text of that element is
# replaced by the sign √, and the original text is inserted where the marker
# [N] appears in the text of a later element. The substitution is repeated
# until no sub="[N]" with a matching marker is left. The text before and
# after the pass is written to LOG. Returns the rewritten string.
sub Reorder {
    my ($markup) = @_;

    say LOG "<R1>\t$markup";

    # One attribute is resolved per pass; stop when a pass changes nothing.
    1 while $markup =~ s/sub="\[(\d+)\]" ?([^>]*>)([^<]*)(<(.|\n)*?>[^<]*)\[\1\]([^<]*<)/$2√$4$3$6/s;

    say LOG "<R2>\t$markup";

    return $markup;
}
|
|
|
|
|
2020-07-28 21:34:00 +00:00
|
|
|
# GetBooksToCheck: reads the list after __DATA__. Lines starting with "#" are
# skipped. On any other line containing a tab, the text before the tab is
# taken as the file stem and the text after it (up to the next tab) as the
# book name. The stem is printed, appended to @filesToRun, and mapped to the
# book name in %fullName.
sub GetBooksToCheck {
    while (<DATA>) {
        chomp;

        next if /^#/;                           # commented-out book
        next unless /([^\t]*)\t([^\t]*)/;       # not a "stem<TAB>name" line

        my ($stem, $bookName) = ($1, $2);

        say "|$stem|";

        push @filesToRun, "$stem";
        $fullName{$stem} = $bookName;
    }
}
|
|
|
|
|
|
|
|
|
2020-07-31 18:47:37 +00:00
|
|
|
|
|
|
|
# ---- Teardown ---------------------------------------------------------------
# Close the report files. Buffered write errors only show up at close time, so
# each result is checked; warn rather than die so that every handle still gets
# closed.
close OUT1 or warn "Error closing Output/Standard.txt: $!";
close OUT2 or warn "Error closing Output/Tagged.txt: $!";
close LOG  or warn "Error closing Logs/Log.txt: $!";

say "\nDone.";
|
|
|
|
# =====
|
|
|
|
__DATA__
|
2020-08-20 18:50:24 +00:00
|
|
|
#41-MAT Matthew
|
2020-07-28 21:34:00 +00:00
|
|
|
#42-MRK Mark
|
2021-04-02 15:06:18 +00:00
|
|
|
#43-LUK Luke
|
2020-07-28 21:34:00 +00:00
|
|
|
#44-JHN John
|
|
|
|
#45-ACT Acts
|
|
|
|
#46-ROM Romans
|
|
|
|
#47-1CO 1 Corinthians
|
|
|
|
#48-2CO 2 Corinthians
|
|
|
|
#49-GAL Galatians
|
|
|
|
#50-EPH Ephesians
|
2021-04-02 15:06:18 +00:00
|
|
|
#51-PHP Philippians
|
2020-08-31 15:40:38 +00:00
|
|
|
#52-COL Colossians
|
2020-07-28 21:34:00 +00:00
|
|
|
#53-1TH 1 Thessalonians
|
|
|
|
#54-2TH 2 Thessalonians
|
|
|
|
#55-1TI 1 Timothy
|
|
|
|
#56-2TI 2 Timothy
|
2020-07-29 19:47:57 +00:00
|
|
|
#57-TIT Titus
|
2020-07-28 21:34:00 +00:00
|
|
|
#58-PHM Philemon
|
|
|
|
#59-HEB Hebrews
|
|
|
|
#60-JAS James
|
|
|
|
#61-1PE 1 Peter
|
|
|
|
#62-2PE 2 Peter
|
|
|
|
#63-1JN 1 John
|
|
|
|
#64-2JN 2 John
|
2021-04-16 16:07:58 +00:00
|
|
|
65-3JN 3 John
|
2020-07-28 21:34:00 +00:00
|
|
|
#66-JUD Jude
|
|
|
|
#67-REV Revelation
|