Tagging ULB

This commit is contained in:
Henry Whitney 2020-08-11 17:52:39 -04:00
parent 4985c2e6e0
commit 5851b15017
1 changed files with 435 additions and 0 deletions

View File

@ -0,0 +1,435 @@
# Takes current tW entries and populates tagged OGNT XML
#
# This is the current best version
# Requires ULB that includes USFMs.
# This version maintains the Greek word order to lessen reordering.
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# ----- Configuration and driver ------------------------------------------
# Arrays interpolated into strings ("@list") are joined with newlines so the
# file lists written to the log come out one path per line.
$" = "\n";
mkdir "Logs";
# LOG is deliberately a bareword (package-level) handle: every sub in this
# file writes its trace output to it.  No layer is given in the mode, so the
# default layer declared by "use open" above applies.
open(LOG, '>', "Logs/tW_pairs.txt") or die "Logs/tW_pairs.txt: $!";
my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml";
my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT";
#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test";
my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible";
my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", "");
# File-scoped state shared by the subs below.
my ($ULBText, $workText, $language);
my ($file);
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);

# Collect the OGNT XML files to process.  $filePattern is a regex fragment
# matched against the end of each file's base name.
my @OGNTfilesToRun = ();
#my $filePattern = '\.xml' ;
my $filePattern = '41-MAT\.xml' ;
find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

# Collect the tW files.  The dot is escaped so that only names really ending
# in ".md" qualify; an unescaped "." would match any character before "md".
my @tWfilesToRun = ();
$filePattern = '\.md' ;
find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;

# Phase 1: build the Strong's-number -> English-forms table (own log file).
Read_tW_Files();
close LOG or die "Logs/tW_pairs.txt: $!";

# Phase 2: load book names and ULB verse text, then tag the OGNT files.
open(LOG, '>', "Logs/Log.txt") or die "Logs/Log.txt: $!";
LongBookNames();
Prepare_ULB_file();
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
ProcessXML();
# TODO: put unused SN at end of verse
close LOG or die "Logs/Log.txt: $!";
say "\nDone.";
# =====
# Read_tW_Files()
#
# Fills the file-scoped %entriesThisSN from the files listed in
# @tWfilesToRun.  From each file it takes
#   * the comma-separated list after "* Strong's: ", and
#   * the comma-separated list on the line following
#     "Forms Found in the English ULB:" and a blank line,
# and appends the forms to every Strong's number named in that file.
#
# Afterwards each value is rewritten as a ", "-joined list ordered longest
# form first, so that callers trying the forms in order test the longest
# candidates before the shorter ones.
#
# Dies (with the file's text as the message) if the forms line is present
# but empty.  Every file name and every final "$sn: list" is written to LOG.
sub Read_tW_Files {
    foreach my $path (@tWfilesToRun) {
        say LOG $path;
        my $fileText = read_file("$path", binmode => 'utf8');

        my @sns;
        if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
            @sns = split /, /, $1;
        }

        my $entries;
        if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
            $entries = $1;
            die "$fileText" if $entries eq "";
        }

        foreach my $sn (@sns) {
            # The key is always created, so the Strong's number still counts
            # as known to tW, but a file without a forms line no longer
            # injects an empty entry into the list.
            $entriesThisSN{$sn} //= "";
            $entriesThisSN{$sn} .= $entries . ", " if defined $entries;
        }
    }

    foreach my $sn (sort keys %entriesThisSN) {
        # Drop empty and repeated forms (several files may list the same one).
        my %seen;
        my @entries = grep { length($_) && !$seen{$_}++ }
                      split /, /, $entriesThisSN{$sn};

        # Longest first; equal lengths alphabetically so the result does not
        # depend on the order in which the files happened to be found.
        # (The former comparator applied numeric <=> to text, which treats
        # every non-numeric string as 0, so length was already the only key
        # that had any effect.)
        @entries = sort { length($b) <=> length($a) || $a cmp $b } @entries;

        $entriesThisSN{$sn} = join ", ", @entries;
        say LOG "$sn: $entriesThisSN{$sn}";
    }
}
# LongBookNames()
#
# Reads the table after __DATA__ (book number, short book code, long book
# name) and stores long names in the file-scoped %longName, keyed by the
# short code.
#
# TAB-separated rows are parsed exactly as before.  Rows without two tabs
# fall back to whitespace separation, where the third field is the rest of
# the line, so names such as "Song of Solomon" stay intact.
sub LongBookNames {
    # A lexical line variable avoids clobbering the caller's $_.
    while (my $row = <DATA>) {
        chomp $row;
        if ($row =~ /([^\t]*)\t([^\t]*)\t([^\t]*)/) {
            $longName{$2} = $3;
        }
        elsif ($row =~ /^\s*(\S+)\s+(\S+)\s+(.*\S)\s*$/) {
            $longName{$2} = $3;
        }
    }
}
# ProcessXML()
#
# For every file in @OGNTfilesToRun, writes a tagged copy with the same base
# name into $outDir.  The input is processed line by line:
#
#   <verse osisID="bk.ch.vs">  starts a verse: looks up its ULB text and
#                              preText and writes a <verse name="..."> line.
#   <w ...>                    a Greek word: its text is moved into a text=""
#                              attribute; the line is queued for matching if
#                              its Strong's number is a key of %entriesThisSN,
#                              otherwise it is marked with a check mark and
#                              queued as skipped.
#   </verse>                   ends a verse: runs the queued words through
#                              ProcessRelevantSNs, writes the <Greek>,
#                              <preText>, <ULB> and <residue> lines, then the
#                              word lines restored to Greek order, any
#                              <usfm> elements found in the ULB text, and
#                              the closing tag itself.
#   anything else              is copied to the output unchanged.
#
# Sets the file-scoped $language ("G" or "H") from the file name.  Dies if an
# input or output file cannot be opened.  Trace output goes to LOG.
sub ProcessXML {
    foreach my $file (@OGNTfilesToRun) {
        my $greekText;
        my $fileGist;
        # The file name is expected to end in six characters plus ".xml";
        # the first two of those are compared numerically with 40 to choose
        # the Strong's-number prefix.
        if ($file =~ /((..)....\.xml)/) {
            ($fileGist, $language) = ($1, $2);
            if ($language > 40) {
                $language = "G"
            } else {$language = "H"}
        }
        say LOG "<0>\t$file \t$fileGist";
        # OUT stays a bareword handle; no layer is named in the mode, so any
        # default layer declared by an "open" pragma still applies.
        open(OUT, '>', "$outDir/$fileGist") or die "$outDir/$fileGist: $!";
        my ($bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines,
            $linesWithRelevantSNs, $linesNotMatched, $linesToSkip, $thisPreText);
        open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";
        while (my $thisLine = <$thisFile>) {
            chomp $thisLine;
            if ($thisLine =~ /<verse/) {say LOG "\n=========================="}
            if ($thisLine =~ /<\/verse>/) {
                say LOG "<0.1>\t$thisLine";
                say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";
                ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
                say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
                # Matched lines arrive as "line◊offset"; order them by the
                # offset and strip the marker.
                my %orderedLine;
                $matchedLines =~ s/\n{2,}/\n/gs;
                say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
                while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
                    $orderedLine{$2} = $1;
                    say LOG "<5>\t\$2: $2\t\$1: $1";
                }
                $matchedLines = "";
                foreach my $line (sort {$a <=> $b} keys %orderedLine) {
                    say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
                    $matchedLines .= "$orderedLine{$line}\n"
                }
                chomp $matchedLines;
                say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
                # Tidy the texts: drop USFM markup and the "q " / " q"
                # sentinels from the residue, then trim surrounding blanks.
                $residueText =~ s/<usfm>.*?<\/usfm>//g;
                $residueText =~ s/(^q | q$)//g;
                $residueText =~ s/ {3,}/ /g;
                $residueText =~ s/^ +//;
                $residueText =~ s/ +$//;
                $greekText =~ s/^ +//;
                $greekText =~ s/ +$//;
                $staticText =~ s/^ +//;
                $staticText =~ s/ +$//;
                # Gather the <usfm> elements of the ULB text, one per line.
                my $internalUSFM;
                $internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /<usfm>.*?<\/usfm>/g);
                $linesNotMatched =~ s/\n+$//;
                $linesToSkip =~ s/\n+$//;
                $matchedLines =~ s/^\n+//;
                $internalUSFM =~ s/\n+$//;
                $internalUSFM =~ s/\t{5,}/\t\t\t\t/g;
                say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
                say OUT "\t\t\t\t<Greek>$greekText</Greek>";
                say OUT "\t\t\t\t<preText>$thisPreText</preText>";
                say OUT "\t\t\t\t<ULB>$staticText</ULB>";
                say OUT "\t\t\t\t<residue>$residueText</residue>";
                say LOG "\$matchedLines:\n$matchedLines \$linesNotMatched:\n$linesNotMatched \$linesToSkip:\n$linesToSkip ";
                my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip);
                #say OUT "$matchedLines" unless ($matchedLines eq "");
                #say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
                #say OUT "$internalUSFM" unless ($internalUSFM eq "");
                #say OUT "$linesToSkip" if ($linesToSkip);
                $wordLines =~ s/<w/\t\t\t\t<w/gs;
                $thisLine = "$wordLines\n$internalUSFM\n$thisLine";
                $thisLine =~ s/\n{2,}/\n/sg;
                say OUT "$thisLine";
                # Reset the per-verse accumulators.
                ($workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $linesWithRelevantSNs) = ();
                ($linesToSkip) = ("");
            }
            elsif ($thisLine =~ /<w /) {
                say LOG "<0.2>\t$thisLine";
                if ($thisLine =~ />([^\n<>]*)</) {
                    $greekText .= $1 . " "
                }
                # Move the element's text into a text="" attribute, leaving
                # the element content empty for the English form.
                $thisLine =~ s/\t(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
                my $thisLemma;
                if ($thisLine =~ /lemma="(\d+)"/) {
                    $thisLemma = $language . $1;
                }
                if (defined $thisLemma && exists $entriesThisSN{$thisLemma}) {
                    $linesWithRelevantSNs .= $thisLine . "\n";
                    say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
                }
                else {
                    # Words with no tW entry, and also words that carry no
                    # numeric lemma at all, are kept as skipped words; the
                    # latter used to vanish from the output entirely.
                    $thisLine =~ s/><\/w>/>√<\/w>/;
                    $linesToSkip .= "$thisLine\n";
                    #say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
                }
            }
            elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
                say LOG "<0.3>\t$thisLine";
                ($bk, $ch, $vs) = ($1,$2,$3);
                ($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
                $staticText = $ULBtextThisVerse{$thisVerse};
                # The "q" sentinels at both ends are stripped again when the
                # verse closes.
                $residueText = "q $staticText q";
                $thisPreText = $ULBpreTextThisVerse{$thisVerse};
                say OUT "\t\t\t<verse name=\"$thisVerse\">";
            }
            else {say OUT $thisLine}
        }
        close $thisFile;
        # Buffered write errors only surface when the handle is closed.
        close OUT or die "$outDir/$fileGist: $!";
    }
}
# RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip)
#
# Takes three newline-separated blocks of <w ...>...</w> elements, collects
# every element whose first attribute is OGNTsort="...", and returns them as
# one string, one element per line (each followed by "\n"), ordered by the
# OGNTsort value.  Elements sharing an OGNTsort value overwrite one another;
# the last one seen wins.  Returns "" when no element is found.
#
# The sort is numeric with a string tie-break.  For equal-width zero-padded
# keys this gives the same order as a plain string sort, but unpadded keys
# no longer place "10" before "9".
sub RestoreGreekOrder {
    my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_);
    say LOG "<00>\t\$matchedLines: $matchedLines \$linesNotMatched: $linesNotMatched \$linesToSkip: $linesToSkip ";
    my %order;
    foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) {
        say LOG "\$thisOne:\n$thisOne";
        next unless defined $thisOne;
        while ($thisOne =~ /(<w OGNTsort="([^"]*)"[^>]*>[^<]*<\/w>)/g) {
            $order{$2} = "$1\n";
        }
    }
    my $wordsLine = join "",
                    map  { $order{$_} }
                    sort { $a <=> $b || $a cmp $b } keys %order;
    say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<";
    return $wordsLine
}
# ProcessRelevantSNs($relevantLines, $staticText, $residueText, $linesNotMatched)
#
# Walks the newline-separated <w> lines in $relevantLines.  Each line that
# carries a numeric lemma is handed to MatchAndPlace together with its
# Strong's number (file-scoped $language prefix + lemma digits).  The line
# MatchAndPlace returns is stripped of trailing blanks/tabs and appended to
# the result; the residue text and the not-matched lines are threaded from
# one call to the next.
#
# Returns ($matchedLines, $residueText, $linesNotMatched).
sub ProcessRelevantSNs {
    my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
    my $matchedLines;
    for my $line (split /\n/, $relevantLines) {
        # Lines without a numeric lemma are ignored here.
        next unless $line =~ /lemma="(\d+)"/;
        my $thisSN = $language . $1;
        say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";

        my $placedLine;
        ($placedLine, $residueText, $linesNotMatched) =
            MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
        $placedLine =~ s/[ \t]+$//;

        $matchedLines .= $placedLine . "\n";
        # An unmatched word contributes an empty line; keep a single newline.
        $matchedLines =~ s/\n{2,}$/\n/s;
        say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++"
    }
    return ($matchedLines, $residueText, $linesNotMatched);
}
# MatchAndPlace($line, $sn, $staticText, $workText, $linesNotMatched)
#
# Tries the English forms stored for Strong's number $sn (in
# %entriesThisSN, in their stored order) against $workText and stops at the
# first one that matches.
#
#   $line             one <w ...></w> element with empty content
#   $sn               key into %entriesThisSN
#   $staticText       unused here; kept for the existing call signature
#   $workText         verse text in which earlier matches are blanked out
#   $linesNotMatched  accumulator of lines for which no form was found
#
# A form may be a plain phrase, or two or three parts separated by " ... ",
# meaning the parts occur in that order with other text in between.  Forms
# are matched literally (regex metacharacters in a form are escaped) and on
# word boundaries.
#
# On a match, the form is written into the element's content, the matched
# words are replaced by spaces in $workText (the gaps of a multi-part form
# are handled by FixWorkText), and the first return value is
# "$line◊$place", $place being the match's character offset.  Without a
# match the element's content becomes "?", the line is appended to
# $linesNotMatched, and the first return value is undef.
#
# Returns ($matchedLines, $workText, $linesNotMatched).
sub MatchAndPlace {
    my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
    #say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText";
    my ($found, $matchedLines);
    my @entries = split /, /, $entriesThisSN{$sn};
    ENTRY:
    foreach my $entry (@entries) {
        # An empty form would match at the first word boundary of any text.
        next ENTRY if $entry eq "";

        # Declared per entry: values left over from a previous three-part
        # form must not leak into the handling of a two-part form.
        my ($first, $second, $third, $firstLen, $secondLen, $thirdLen);
        # $workEntry is the unescaped pattern text used for logging and
        # passed on to FixWorkText; $pattern is what is actually matched.
        my ($workEntry, $pattern, $entryType);
        if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
            ($first, $second, $third) = ($1, $2, $3);
            ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
            $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third;
            $pattern = join ".*?", map { "\\b" . quotemeta($_) . "\\b" } ($first, $second, $third);
            say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 1;
        }
        elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
            ($first, $second) = ($1, $2);
            ($firstLen, $secondLen) = (length $first, length $second);
            $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b";
            $pattern = join ".*?", map { "\\b" . quotemeta($_) . "\\b" } ($first, $second);
            say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 2;
        }
        else {
            $workEntry = $entry;
            $pattern = "\\b" . quotemeta($entry) . "\\b";
        }
        #say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
        next ENTRY unless $workText =~ /$pattern/p;

        # Copy the match pieces before any further regex operation.
        my ($before, $foundText, $after) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
        say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
        # "ı" temporarily marks where the found text stood.
        $workText = "${before}ı${after}";
        my ($place, $foundTextLength) = (length $before, length $foundText);
        $line =~ s/></>$entry</;
        if ($entryType) {
            say LOG "<8.2>\n\$workText,: $workText, \$matchedLines:\n$matchedLines ";
            ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);
            say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
        }
        else {
            # Same-length run of spaces, so later offsets stay valid.
            my $replacementSpaces = " " x $foundTextLength;
            $workText =~ s/ı/$replacementSpaces/;
        }
        $matchedLines .= "$line◊$place";
        say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
        $found = 1;
        last ENTRY;
    }
    unless ($found) {
        $line =~ s/></>?</;
        $linesNotMatched .= "$line\n"
    }
    return ($matchedLines, $workText, $linesNotMatched)
}
# FixWorkText($thisLine, $text, $entry, $foundText, $foundTextLength,
#             $first, $firstLen, $second, $secondLen, $third, $thirdLen)
#
# Replaces the first "ı" placeholder (U+0131) in $text with $foundText in
# which the parts of a multi-part form have been blanked out:
# $first/$second/$third become runs of $firstLen/$secondLen/$thirdLen
# spaces, while the text between the parts is kept.  $thisLine and $entry
# are only used for logging.
#
# The parts are matched literally, anchored to both ends of $foundText and
# on word boundaries, mirroring the "\bfirst\b.*?\bsecond\b..." search that
# MatchAndPlace builds.  If the three-part shape does not fit, the two-part
# shape is tried; if neither fits, the whole found text is replaced by
# spaces.  In every case the placeholder is removed.
#
# Returns the updated text as a one-element list.
sub FixWorkText {
    my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;
    my $firstSpace  = " " x ($firstLen  // 0);
    my $secondSpace = " " x ($secondLen // 0);
    my $thirdSpace  = " " x ($thirdLen  // 0);
    say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";
    my $repText;
    # Test for presence rather than truth, so a third part of "0" counts.
    if (defined $third && length $third
        && $foundText =~ /\A\Q$first\E\b(.*?)\b\Q$second\E\b(.*?)\b\Q$third\E\z/) {
        my ($firstGap, $secondGap) = ($1, $2);
        $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
        say LOG "<9.1> \$repText: $repText";
    }
    elsif ($foundText =~ /\A\Q$first\E\b(.*?)\b\Q$second\E\z/) {
        my ($firstGap) = ($1);
        say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
        $repText = "$firstSpace$firstGap$secondSpace";
        say LOG "<9.3> \$repText: |$repText|";
    }
    else {
        # Blank the whole match so the placeholder never survives.
        $repText = " " x ($foundTextLength // length $foundText);
    }
    # \x{131} is the "ı" placeholder character, written as an escape so the
    # match does not depend on how this source file is decoded.
    $text =~ s/\x{131}/$repText/;
    return ($text)
}
# Prepare_ULB_file()
#
# Scans $ULBfile line by line and fills the file-scoped hashes
# %ULBpreTextThisVerse and %ULBtextThisVerse, both keyed by verse name.
# A line containing verse name="..." sets the current verse; a later
# <preText>...</preText> or <text>...</text> line is stored under it.  Only
# the first of these three patterns that matches a line is acted on.
# Dies if the file cannot be opened.
sub Prepare_ULB_file {
    # Earlier whole-file approach, kept for reference:
    #$ULBText = read_file($ULBfile, binmode => 'utf8');
    #while ($ULBText =~ /<verse name="(.*?)">\n<preText>(.*?)<\/preText>\n.*<text>(.*?)<\/text>.*<\/verse>/sg) {
    # ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2);
    #}
    #
    open (my $ulbHandle, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";
    my $currentVerse;
    ROW:
    while (my $row = <$ulbHandle>) {
        chomp $row;
        if ($row =~ /verse name="(.*?)"/) {
            $currentVerse = $1;
            #say LOG "$currentVerse:\n$row"
            next ROW;
        }
        if ($row =~ /<preText>(.*?)<\/preText>/) {
            $ULBpreTextThisVerse{$currentVerse} = $1;
            #say LOG "$currentVerse:\n$ULBpreTextThisVerse{$currentVerse}"
            next ROW;
        }
        if ($row =~ /<text>(.*?)<\/text>/) {
            $ULBtextThisVerse{$currentVerse} = $1;
            #say LOG "$currentVerse:\n$ULBtextThisVerse{$currentVerse}"
        }
    }
    close $ulbHandle;
}
__DATA__
01 gen Genesis
02 exo Exodus
03 lev Leviticus
04 num Numbers
05 deu Deuteronomy
06 jos Joshua
07 jdg Judges
08 rut Ruth
09 1sa 1 Samuel
10 2sa 2 Samuel
11 1ki 1 Kings
12 2ki 2 Kings
13 1ch 1 Chronicles
14 2ch 2 Chronicles
15 ezr Ezra
16 neh Nehemiah
17 est Esther
18 job Job
19 psa Psalms
20 pro Proverbs
21 ecc Ecclesiastes
22 sng Song of Solomon
23 isa Isaiah
24 jer Jeremiah
25 lam Lamentations
26 ezk Ezekiel
27 dan Daniel
28 hos Hosea
29 jol Joel
30 amo Amos
31 oba Obadiah
32 jon Jonah
33 mic Micah
34 nam Nahum
35 hab Habakkuk
36 zep Zephaniah
37 hag Haggai
38 zec Zechariah
39 mal Malachi
41 mat Matthew
42 mrk Mark
43 luk Luke
44 jhn John
45 act Acts
46 rom Romans
47 1co 1 Corinthians
48 2co 2 Corinthians
49 gal Galatians
50 eph Ephesians
51 php Philippians
52 col Colossians
53 1th 1 Thessalonians
54 2th 2 Thessalonians
55 1ti 1 Timothy
56 2ti 2 Timothy
57 tit Titus
58 phm Philemon
59 heb Hebrews
60 jas James
61 1pe 1 Peter
62 2pe 2 Peter
63 1jn 1 John
64 2jn 2 John
65 3jn 3 John
66 jud Jude
67 rev Revelation