From 5851b15017e677dee7ab7cb799d6e89fdf6525cc Mon Sep 17 00:00:00 2001 From: Henry Whitney Date: Tue, 11 Aug 2020 17:52:39 -0400 Subject: [PATCH] Tagging ULB --- ...s_from_unified_ULB_XML_and_tWs_and_OGNT.pl | 435 ++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl diff --git a/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl b/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl new file mode 100644 index 00000000..d501d5d4 --- /dev/null +++ b/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl @@ -0,0 +1,435 @@ +# Takes current tW entries and populates tagged OGNT XML +# +# This is the current best version +# Requires ULB that includes USFMs. + +# This version maintains the Greek word order to lessen reordering. + + +use 5.12.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; +$" = "\n"; + +mkdir "Logs"; +open(LOG, ">Logs/tW_pairs.txt") or die "$!"; +my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml"; +my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT"; +#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test"; +my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible"; +my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", ""); +my ($ULBText, $workText, $language); +my ($file); +my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName); + +my @OGNTfilesToRun = (); +#my $filePattern = '\.xml' ; +my $filePattern = '41-MAT\.xml' ; +find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ; + +say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; + +my @tWfilesToRun = (); +$filePattern = '.md' ; +find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ; + +Read_tW_Files(); + +close LOG; +open(LOG, ">Logs/Log.txt") or die "$!"; + +LongBookNames(); +Prepare_ULB_file(); +say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; + +ProcessXML(); +# put unused SN at end of verse + +close LOG; + +say "\nDone."; +# ===== +sub Read_tW_Files { + foreach $file ( @tWfilesToRun ) { + say LOG $file; + my (@sns); + my $entries; + my $fileText = read_file("$file", binmode => 'utf8'); + if ($fileText =~ /\* Strong's: ([^\n]*)\n/) { + my $sns = $1; + #say LOG "\t$sns"; + @sns = split /, /, $sns; + } + if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) { + $entries = $1; + die "$fileText" if $entries eq ""; + #say LOG "\t\t$entries" + } + foreach my $sn (@sns) { + $entriesThisSN{$sn} .= $entries . ", "; + #say LOG "\t\t\t$sn: $entriesThisSN{$sn}" + } + } + foreach my $sn (sort keys %entriesThisSN) { + #say LOG "$sn: $entriesThisSN{$sn}"; + my @entries = split /, /, $entriesThisSN{$sn}; + @entries = reverse sort { substr($a,0,1) <=> substr($b,0,1) + || length($a) <=> length($b) + || $a <=> $b } + @entries; + $entriesThisSN{$sn} = ""; + foreach my $slice (@entries) { + $entriesThisSN{$sn} .= "$slice, " + } + $entriesThisSN{$sn} =~ s/, $//; + say LOG "$sn: $entriesThisSN{$sn}"; + } + +} +sub LongBookNames { + while () { + chomp; + if (/([^\t]*)\t([^\t]*)\t([^\t]*)/) { + $longName{$2} = $3 + } + } +} +sub ProcessXML { + foreach my $file (@OGNTfilesToRun) { + my $greekText; + my $fileGist; + if ($file =~ /((..)....\.xml)/) { + ($fileGist, $language) = ($1, $2); + if ($language > 40) { + $language = "G" + } else {$language = "H"} + + } + say LOG "<0>\t$file \t$fileGist"; + open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!"; + my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput, + $linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText); + open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!"; + my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount); + while (my $thisLine = <$thisFile>) { + chomp $thisLine; + if ($thisLine =~ //) { + say LOG "<0.1>\t$thisLine"; + say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText"; + + ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText); + say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + my %orderedLine; + $matchedLines =~ s/\n{2,}/\n/gs; + say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n"; + while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) { + $orderedLine{$2} = $1; + say LOG "<5>\t\$2: $2\t\$1: $1"; + } + $matchedLines = ""; + foreach my $line (sort {$a <=> $b} keys %orderedLine) { + say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}"; + $matchedLines .= "$orderedLine{$line}\n" + } + chomp $matchedLines; + say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + $residueText =~ s/.*?<\/usfm>//g; + $residueText =~ s/(^q | q$)//g; + $residueText =~ s/ {3,}/ /g; + $residueText =~ s/^ +//; + $residueText =~ s/ +$/$1/; + $greekText =~ s/^ +//; + $greekText =~ s/ +$/$1/; + $staticText =~ s/^ +//; + $staticText =~ s/ +$/$1/; + my $internalUSFM; + $internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /.*?<\/usfm>/g); + $linesNotMatched =~ s/\n+$//; + $linesToSkip =~ s/\n+$//; + $matchedLines =~ s/^\n+//; + $internalUSFM =~ s/\n+$//; + $internalUSFM =~ s/\t{5,}/\t\t\t\t/g; + say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + say OUT "\t\t\t\t$greekText"; + say OUT "\t\t\t\t$thisPreText"; + say OUT "\t\t\t\t$staticText"; + say OUT "\t\t\t\t$residueText"; + say LOG "\$matchedLines:\n$matchedLines \$linesNotMatched:\n$linesNotMatched \$linesToSkip:\n$linesToSkip "; + my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip); + #say OUT "$matchedLines" unless ($matchedLines eq ""); + #say OUT "$linesNotMatched" unless ($linesNotMatched eq ""); + #say OUT "$internalUSFM" unless ($internalUSFM eq ""); + #say OUT "$linesToSkip" if ($linesToSkip); + $wordLines =~ s/\t$thisLine"; + $originalLinesCount ++; + if ($thisLine =~ />([^\n<>]*)([^<]*)(<\/w>)/$1 text="$2">$3/; + if ($thisLine =~ /lemma="(\d+)"/) { + my $thisLemma = $language . $1; + if (exists $entriesThisSN{$thisLemma}) { + $rsnCount ++; + $linesWithRelevantSNs .= $thisLine . "\n"; + say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs"; + } + else { + $skipCount ++; + $thisLine =~ s/><\/w>/>√<\/w>/; + $linesToSkip .= "$thisLine\n"; + #say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip"; + } + } + } + elsif ($thisLine =~ //) { + say LOG "<0.3>\t$thisLine"; + ($bk, $ch, $vs) = ($1,$2,$3); + ($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", ""); + $staticText = $ULBtextThisVerse{$thisVerse}; + $residueText = "q $staticText q"; + $thisPreText = $ULBpreTextThisVerse{$thisVerse}; + say OUT "\t\t\t"; + ($flag) = (1); + } + else {say OUT $thisLine} + } + + close $thisFile; + close OUT; + } +} +sub RestoreGreekOrder { + my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_); + say LOG "<00>\t\$matchedLines: $matchedLines \$linesNotMatched: $linesNotMatched \$linesToSkip: $linesToSkip "; + my $wordsLine; + my %order; + foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) { + say LOG "\$thisOne:\n$thisOne"; + while ($thisOne =~ /(]*>[^<]*<\/w>)/g) { + $order{$2} = "$1\n"; + } + } + foreach my $line (sort keys %order) { + $wordsLine .= $order{$line} + } + say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<"; + return $wordsLine +} +sub ProcessRelevantSNs { + my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_); + my ($matchedLines, $thisLine); + my @relevantLines = split /\n/, $relevantLines; + foreach my $line (@relevantLines) { + if ($line =~ /lemma="(\d+)"/) { + my $thisSN = $language . $1; + say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}"; + ($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched); + $thisLine =~ s/[ \t]+$//; + $matchedLines .= $thisLine . "\n"; + $matchedLines =~ s/\n{2,}$/\n/s; + say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++" + } + } + return ($matchedLines, $residueText, $linesNotMatched); +} +sub MatchAndPlace { + my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_; + #say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText"; + my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen); + my @entries = split /, /, $entriesThisSN{$sn}; + foreach my $entry (@entries) { + my $entryType; + if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) { + ($first, $second, $third) = ($1, $2, $3); + ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third); + $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third; + say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|"; + $entryType = 1; + } + elsif ($entry =~ /^(.*) \.\.\. (.*)$/) { + ($first, $second) = ($1, $2); + ($firstLen, $secondLen) = (length $first, length $second); + $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b"; + say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|"; + $entryType = 2; + } + else {$workEntry = $entry;} + + my $foundText; + #say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; + if ($workText =~ /\b$workEntry\b/p) { + say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; + ($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}"); + my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, ""); + $line =~ s/>$entry\n\$workText,: $workText, \$matchedLines:\n$matchedLines "; + + ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen); + + say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines"; + + } + + else {$workText =~ s/ı/$replacementSpaces/;} + + $matchedLines .= "$line◊$place"; + + say LOG "<8.4>\tAfter found, new \$workText:\n$workText"; + $found = 1; + } + else { + #say LOG "\$workEntry $workEntry not found" + } + if ($found) { + last + } + } + unless ($found) { + $line =~ s/>?\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace"; + if ($third) { + if ($foundText =~ /$first(.*)$second(.*)$third/) { + my ($firstGap, $secondGap) = ($1, $2); + my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace"; + say LOG "<9.1> \$repText: $repText"; + $text =~ s/ı/$repText/; + } + } + else { + if ($foundText =~ /$first(.*)$second/) { + my ($firstGap) = ($1); + say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|"; + my $repText ="$firstSpace$firstGap$secondSpace"; + say LOG "<9.3> \$repText: |$repText|"; + $text =~ s/ı/$repText/; + } + } + return ($text) +} +sub Prepare_ULB_file { + + my $thisVerse; + #$ULBText = read_file($ULBfile, binmode => 'utf8'); + + #while ($ULBText =~ /\n(.*?)<\/preText>\n.*(.*?)<\/text>.*<\/verse>/sg) { + # ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2); + #} + # + + open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!"; + + while (my $thisLine = <$file>) { + chomp $thisLine; + if ($thisLine =~ /verse name="(.*?)"/) { + $thisVerse = $1; + #say LOG "$thisVerse:\n$thisLine" + } + elsif ($thisLine =~ /(.*?)<\/preText>/) { + $ULBpreTextThisVerse{$thisVerse} = $1; + #say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}" + } + elsif ($thisLine =~ /(.*?)<\/text>/) { + $ULBtextThisVerse{$thisVerse} = $1; + #say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}" + } + } + + close $file; +} + +__DATA__ +01 gen Genesis +02 exo Exodus +03 lev Leviticus +04 num Numbers +05 deu Deuteronomy +06 jos Joshua +07 jdg Judges +08 rut Ruth +09 1sa 1 Samuel +10 2sa 2 Samuel +11 1ki 1 Kings +12 2ki 2 Kings +13 1ch 1 Chronicles +14 2ch 2 Chronicles +15 ezr Ezra +16 neh Nehemiah +17 est Esther +18 job Job +19 psa Psalms +20 pro Proverbs +21 ecc Ecclesiastes +22 sng Song of Solomon +23 isa Isaiah +24 jer Jeremiah +25 lam Lamentations +26 ezk Ezekiel +27 dan Daniel +28 hos Hosea +29 jol Joel +30 amo Amos +31 oba Obadiah +32 jon Jonah +33 mic Micah +34 nam Nahum +35 hab Habakkuk +36 zep Zephaniah +37 hag Haggai +38 zec Zechariah +39 mal Malachi +41 mat Matthew +42 mrk Mark +43 luk Luke +44 jhn John +45 act Acts +46 rom Romans +47 1co 1 Corinthians +48 2co 2 Corinthians +49 gal Galatians +50 eph Ephesians +51 php Philippians +52 col Colossians +53 1th 1 Thessalonians +54 2th 2 Thessalonians +55 1ti 1 Timothy +56 2ti 2 Timothy +57 tit Titus +58 phm Philemon +59 heb Hebrews +60 jas James +61 1pe 1 Peter +62 2pe 2 Peter +63 1jn 1 John +64 2jn 2 John +65 3jn 3 John +66 jud Jude +67 rev Revelation