# Checks ULB in ULB.xml against tagged ULB use 5.18.0; use File::Slurp; use File::Find ; use Cwd ; use utf8; #use open IN => ":utf8", OUT => ":utf8"; use open IO => ":utf8"; open(LOG, ">Logs/Log.txt") or die "$!"; open(OUT1, ">Output/Standard.txt") or die "$!"; open(OUT2, ">Output/Tagged.txt") or die "$!"; my ($ULBxml, $taggedULBDir) = ("/Users/dillardfam/Documents/WA/WACS/fork/ULB_xml/ULB.xml", "/Users/dillardfam/Documents/WA/WACS/fork/Manual_Tagging"); my (@filesToRun) = (); my %fullName; my $filePattern = "\55-1TI.xml" ; my $file; my $xmlText = read_file("$ULBxml", binmode => 'utf8'); GetBooksToCheck(); Compare(); sub Compare { foreach my $file (@filesToRun) { say LOG "|$file|, |$taggedULBDir/$file.xml|"; my $taggedText = read_file("$taggedULBDir/$file.xml", binmode => 'utf8'); GetGist($file, $taggedText); } } sub GetGist { my ($fileName, $wholeTaggedText) = @_; my ($verseRef, $standard, $tagged); say LOG "|$fileName|, |$fullName{$fileName}|"; # while ($wholeTaggedText =~ /((.|\n)*?)(.*?)<\/preText>((.|\n)*?)\n\t+((/sg) { while ($wholeTaggedText =~ /((.|\n)*?)(.*?)<\/preText>((.|\n)*?)\n\t+((<(w|usfm)(>| )((.|\n)*?)\n)*)\t+<\/verse>/sg) { my ($preText, $gist) = ($4, $7); $gist =~ s/.*?<\/comment>//sg; $verseRef = $1; if ($xmlText =~ /\n\t+([^\n]*)<\/preText>\n\t+([^\n]*)<\/text>\n\t+<\/verse>/s) { my ($standardPT, $standardT) = ($1, $2); ($tagged) = Untag($preText, $gist); #say LOG $tagged; $standard = $standardPT . " " . $standardT; $standard =~ s/<[^<>]*>//g; $standard =~ s/ {2,}/ /g; $standard =~ s/ +$//; if ($standard ne $tagged) { say LOG "\n$verseRef\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n"; say OUT1 $standard; say OUT2 $tagged; } } } } sub Untag { my ($pre, $txt) = ($_[0], $_[1]); say LOG "<00>\t\$pre: $pre\n\$txt: $txt"; if ($txt =~ /\[\d\]/) { $txt = Reorder($txt); say LOG "<0>\t$txt"; } while ($txt =~ s/\n(\t*]*>[^<]*<\/w>\n)*\t*([^<]*)<\/phraseWords>\n\t*<\/phrase>/$2<\/w>/sg) { my $phraseWords = $2; #$txt = Phrase($txt); say LOG "<0a>\$phraseWords: $phraseWords" } $txt =~ s/[\t\n]/ /g; $txt =~ s/(√|<[^<>]*>)//g; $txt = $pre . " " . $txt; $txt =~ s/ {2,}/ /g; $txt =~ s/^(.+[^ ])(\\)/$1 $2/g; $txt =~ s/ +$//; $txt =~ s/— +/—/g; # say LOG "<1>\t$txt"; return $txt; } sub Reorder { my $txt = $_[0]; say LOG "\t$txt"; while ($txt =~ s/sub="\[(\d+)\]" ?([^>]*>)([^<]*)(<(.|\n)*?>[^<]*)\[\1\]([^<]*<)/$2√$4$3$6/s) {} say LOG "\t$txt"; return $txt } sub GetBooksToCheck { while () { chomp; unless (/^#/) { if (/([^\t]*)\t([^\t]*)/) { my ($file, $book) = ($1, $2); say "|$file|"; push @filesToRun, "$file"; $fullName{$file} = $book; } } } } close OUT1; close OUT2; close LOG; say "\nDone."; # ===== __DATA__ #41-MAT Matthew #42-MRK Mark #43-LUK Luke #44-JHN John #45-ACT Acts #46-ROM Romans #47-1CO 1 Corinthians #48-2CO 2 Corinthians #49-GAL Galatians #50-EPH Ephesians #51-PHP Philippians #52-COL Colossians #53-1TH 1 Thessalonians #54-2TH 2 Thessalonians #55-1TI 1 Timothy #56-2TI 2 Timothy #57-TIT Titus #58-PHM Philemon #59-HEB Hebrews #60-JAS James #61-1PE 1 Peter #62-2PE 2 Peter #63-1JN 1 John #64-2JN 2 John 65-3JN 3 John #66-JUD Jude #67-REV Revelation