# Checks ULB in ULB.xml against tagged ULB use 5.18.0; use File::Slurp; use File::Find ; use Cwd ; use utf8; #use open IN => ":utf8", OUT => ":utf8"; use open IO => ":utf8"; open(LOG, ">Logs/Log.txt") or die "$!"; open(OUT1, ">Output/Standard.txt") or die "$!"; open(OUT2, ">Output/Tagged.txt") or die "$!"; my ($ULBxml, $taggedULBDir) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml", "/Users/Henry/Documents/WACS/en_ulb_tagged/Manual_Tagging"); my (@filesToRun) = (); my %fullName; my $filePattern = "\.xml" ; my $file; my $xmlText = read_file("$ULBxml", binmode => 'utf8'); GetBooksToCheck(); Compare(); sub Compare { foreach my $file (@filesToRun) { say LOG "|$file|, |$taggedULBDir/$file.xml|"; my $taggedText = read_file("$taggedULBDir/$file.xml", binmode => 'utf8'); GetGist($file, $taggedText); } } sub GetGist { my ($fileName, $wholeTaggedText) = @_; my ($verseRef, $standard, $tagged); say LOG "|$fileName|, |$fullName{$fileName}|"; while ($wholeTaggedText =~ /((.|\n)*?)(.*?)<\/preText>((.|\n)*?)\n\t+((/sg) { my ($preText, $gist) = ($4, $7); $verseRef = $1; if ($xmlText =~ /\n\t+([^\n]*)<\/preText>\n\t+([^\n]*)<\/text>\n\t+<\/verse>/s) { my ($standardPT, $standardT) = ($1, $2); ($tagged) = Untag($preText, $gist); #say LOG $tagged; $standard = $standardPT . " " . $standardT; $standard =~ s/<[^<>]*>//g; $standard =~ s/ {2,}/ /g; $standard =~ s/ +$//; if ($standard ne $tagged) { say LOG "\n$verseRef\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n"; say OUT1 $standard; say OUT2 $tagged; } } } } sub Untag { my ($pre, $txt) = ($_[0], $_[1]); #say LOG "\$pre: $pre\n\$txt: $txt"; $txt =~ s/[\t\n]/ /g; $txt =~ s/(√|<[^<>]*>)//g; $txt = $pre . " " . $txt; # say LOG "<0>\t$txt"; $txt =~ s/ {2,}/ /g; $txt =~ s/— /—/g; $txt =~ s/^(.+[^ ])(\\)/$1 $2/g; $txt =~ s/ +$//; # say LOG "<1>\t$txt"; return $txt; } sub GetBooksToCheck { while () { chomp; unless (/^#/) { if (/([^\t]*)\t([^\t]*)/) { my ($file, $book) = ($1, $2); say "|$file|"; push @filesToRun, "$file"; $fullName{$file} = $book; } } } } close OUT1; close OUT2; close LOG; say "\nDone."; # ===== __DATA__ #41-MAT Matthew #42-MRK Mark #43-LUK Luke #44-JHN John #45-ACT Acts #46-ROM Romans #47-1CO 1 Corinthians #48-2CO 2 Corinthians #49-GAL Galatians #50-EPH Ephesians #51-PHP Philippians 52-COL Colossians #53-1TH 1 Thessalonians #54-2TH 2 Thessalonians #55-1TI 1 Timothy #56-2TI 2 Timothy #57-TIT Titus #58-PHM Philemon #59-HEB Hebrews #60-JAS James #61-1PE 1 Peter #62-2PE 2 Peter #63-1JN 1 John #64-2JN 2 John #65-3JN 3 John #66-JUD Jude #67-REV Revelation