# Takes current tW entries and populates tagged OGNT XML # # This is the current best version # Requires ULB that includes USFMs. # This version maintains the Greek word order to lessen reordering. use 5.12.0; use File::Slurp; use File::Find ; use Cwd ; use utf8; #use open IN => ":utf8", OUT => ":utf8"; use open IO => ":utf8"; $" = "\n"; mkdir "Logs"; open(LOG, ">Logs/tW_pairs.txt") or die "$!"; my $ULBfile = "/Users/dillardfam/Documents/WACS/fork/ULB_xml/ULB.xml"; my $topDirOGNT = "/Users/dillardfam/Documents/WACS/OGNT"; #my $topDirOGNT = "/Users/dillardfam/Documents/WACS/fork/Tag_test"; my $topDirtW = "/Users/dillardfam/Documents/WACS/en_tw/bible"; my ($outDir, $outFile) = ("/Users/dillardfam/Documents/WACS/fork/Auto-tagged", ""); my ($ULBText, $workText, $language); my ($file); my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName); my @OGNTfilesToRun = (); #my $filePattern = '\.xml' ; my $filePattern = '43-LUK\.xml' ; find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ; say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; my @tWfilesToRun = (); $filePattern = '.md' ; find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ; Read_tW_Files(); close LOG; open(LOG, ">Logs/Log.txt") or die "$!"; LongBookNames(); Prepare_ULB_file(); say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; ProcessXML(); # put unused SN at end of verse close LOG; say "\nDone."; # ===== sub Read_tW_Files { foreach $file ( @tWfilesToRun ) { say LOG $file; my (@sns); my $entries; my $fileText = read_file("$file", binmode => 'utf8'); if ($fileText =~ /\* Strong's: ([^\n]*)\n/) { my $sns = $1; #say LOG "\t$sns"; @sns = split /, /, $sns; } if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) { $entries = $1; die "$fileText" if $entries eq ""; #say LOG "\t\t$entries" } foreach my $sn (@sns) { $entriesThisSN{$sn} .= $entries . ", "; #say LOG "\t\t\t$sn: $entriesThisSN{$sn}" } } foreach my $sn (sort keys %entriesThisSN) { #say LOG "$sn: $entriesThisSN{$sn}"; my @entries = split /, /, $entriesThisSN{$sn}; @entries = reverse sort { substr($a,0,1) <=> substr($b,0,1) || length($a) <=> length($b) || $a <=> $b } @entries; $entriesThisSN{$sn} = ""; foreach my $slice (@entries) { $entriesThisSN{$sn} .= "$slice, " } $entriesThisSN{$sn} =~ s/, $//; say LOG "$sn: $entriesThisSN{$sn}"; } } sub LongBookNames { while () { chomp; if (/^([^#][^\t]*)\t([^\t]*)\t([^\t]*)/) { $longName{$2} = $3 } } } sub ProcessXML { foreach my $file (@OGNTfilesToRun) { my $greekText; my $fileGist; if ($file =~ /((..)....\.xml)/) { ($fileGist, $language) = ($1, $2); if ($language > 40) { $language = "G" } else {$language = "H"} } say LOG "<0>\t$file \t$fileGist"; open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!"; my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput, $linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText); open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!"; my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount); while (my $thisLine = <$thisFile>) { chomp $thisLine; if ($thisLine =~ //) { say LOG "<0.1>\t$thisLine"; say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText"; ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText); say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; my %orderedLine; $matchedLines =~ s/\n{2,}/\n/gs; say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n"; while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) { $orderedLine{$2} = $1; say LOG "<5>\t\$2: $2\t\$1: $1"; } $matchedLines = ""; foreach my $line (sort {$a <=> $b} keys %orderedLine) { say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}"; $matchedLines .= "$orderedLine{$line}\n" } chomp $matchedLines; say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; $residueText =~ s/.*?<\/usfm>//g; $residueText =~ s/(^q | q$)//g; $residueText =~ s/ {3,}/ /g; $residueText =~ s/^ +//; $residueText =~ s/ +$/$1/; $greekText =~ s/^ +//; $greekText =~ s/ +$/$1/; $staticText =~ s/^ +//; $staticText =~ s/ +$/$1/; my $internalUSFM; $internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /.*?<\/usfm>/g); $linesNotMatched =~ s/\n+$//; $linesToSkip =~ s/\n+$//; $matchedLines =~ s/^\n+//; $internalUSFM =~ s/\n+$//; $internalUSFM =~ s/\t{5,}/\t\t\t\t/g; say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; say OUT "\t\t\t\t$greekText"; say OUT "\t\t\t\t$thisPreText"; say OUT "\t\t\t\t$staticText"; say OUT "\t\t\t\t$residueText"; say LOG "\$matchedLines:\n$matchedLines \$linesNotMatched:\n$linesNotMatched \$linesToSkip:\n$linesToSkip "; my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip); #say OUT "$matchedLines" unless ($matchedLines eq ""); #say OUT "$linesNotMatched" unless ($linesNotMatched eq ""); #say OUT "$internalUSFM" unless ($internalUSFM eq ""); #say OUT "$linesToSkip" if ($linesToSkip); $wordLines =~ s/\t$thisLine"; $originalLinesCount ++; if ($thisLine =~ />([^\n<>]*)([^<]*)(<\/w>)/$1 text="$2">$3/; if ($thisLine =~ /lemma="(\d+)"/) { my $thisLemma = $language . $1; if (exists $entriesThisSN{$thisLemma}) { $rsnCount ++; $linesWithRelevantSNs .= $thisLine . "\n"; say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs"; } else { $skipCount ++; $thisLine =~ s/><\/w>/>√<\/w>/; $linesToSkip .= "$thisLine\n"; #say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip"; } } } elsif ($thisLine =~ //) { say LOG "<0.3>\t$thisLine"; ($bk, $ch, $vs) = ($1,$2,$3); ($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", ""); $staticText = $ULBtextThisVerse{$thisVerse}; $residueText = "q $staticText q"; $thisPreText = $ULBpreTextThisVerse{$thisVerse}; say OUT "\t\t\t"; ($flag) = (1); } else {say OUT $thisLine} } close $thisFile; close OUT; } } sub RestoreGreekOrder { my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_); say LOG "<00>\t\$matchedLines: $matchedLines \$linesNotMatched: $linesNotMatched \$linesToSkip: $linesToSkip "; my $wordsLine; my %order; foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) { say LOG "\$thisOne:\n$thisOne"; while ($thisOne =~ /(]*>[^<]*<\/w>)/g) { $order{$2} = "$1\n"; } } foreach my $line (sort keys %order) { $wordsLine .= $order{$line} } say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<"; return $wordsLine } sub ProcessRelevantSNs { my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_); my ($matchedLines, $thisLine); my @relevantLines = split /\n/, $relevantLines; foreach my $line (@relevantLines) { if ($line =~ /lemma="(\d+)"/) { my $thisSN = $language . $1; say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}"; ($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched); $thisLine =~ s/[ \t]+$//; $matchedLines .= $thisLine . "\n"; $matchedLines =~ s/\n{2,}$/\n/s; say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++" } } return ($matchedLines, $residueText, $linesNotMatched); } sub MatchAndPlace { my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_; #say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText"; my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen); my @entries = split /, /, $entriesThisSN{$sn}; foreach my $entry (@entries) { my $entryType; if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) { ($first, $second, $third) = ($1, $2, $3); ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third); $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third; say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|"; $entryType = 1; } elsif ($entry =~ /^(.*) \.\.\. (.*)$/) { ($first, $second) = ($1, $2); ($firstLen, $secondLen) = (length $first, length $second); $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b"; say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|"; $entryType = 2; } else {$workEntry = $entry;} my $foundText; #say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; if ($workText =~ /\b$workEntry\b/p) { say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; ($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}"); my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, ""); $line =~ s/>$entry\n\$workText,: $workText, \$matchedLines:\n$matchedLines "; ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen); say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines"; } else {$workText =~ s/ı/$replacementSpaces/;} $matchedLines .= "$line◊$place"; say LOG "<8.4>\tAfter found, new \$workText:\n$workText"; $found = 1; } else { #say LOG "\$workEntry $workEntry not found" } if ($found) { last } } unless ($found) { $line =~ s/>√\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace"; if ($third) { if ($foundText =~ /$first(.*)$second(.*)$third/) { my ($firstGap, $secondGap) = ($1, $2); my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace"; say LOG "<9.1> \$repText: $repText"; $text =~ s/ı/$repText/; } } else { if ($foundText =~ /$first(.*)$second/) { my ($firstGap) = ($1); say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|"; my $repText ="$firstSpace$firstGap$secondSpace"; say LOG "<9.3> \$repText: |$repText|"; $text =~ s/ı/$repText/; } } return ($text) } sub Prepare_ULB_file { my $thisVerse; #$ULBText = read_file($ULBfile, binmode => 'utf8'); #while ($ULBText =~ /\n(.*?)<\/preText>\n.*(.*?)<\/text>.*<\/verse>/sg) { # ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2); #} # open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!"; while (my $thisLine = <$file>) { chomp $thisLine; if ($thisLine =~ /verse name="(.*?)"/) { $thisVerse = $1; #say LOG "$thisVerse:\n$thisLine" } elsif ($thisLine =~ /(.*?)<\/preText>/) { $ULBpreTextThisVerse{$thisVerse} = $1; #say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}" } elsif ($thisLine =~ /(.*?)<\/text>/) { $ULBtextThisVerse{$thisVerse} = $1; #say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}" } } close $file; } __DATA__ 01 gen Genesis 02 exo Exodus 03 lev Leviticus 04 num Numbers 05 deu Deuteronomy 06 jos Joshua 07 jdg Judges 08 rut Ruth 09 1sa 1 Samuel 10 2sa 2 Samuel 11 1ki 1 Kings 12 2ki 2 Kings 13 1ch 1 Chronicles 14 2ch 2 Chronicles 15 ezr Ezra 16 neh Nehemiah 17 est Esther 18 job Job 19 psa Psalms 20 pro Proverbs 21 ecc Ecclesiastes 22 sng Song of Solomon 23 isa Isaiah 24 jer Jeremiah 25 lam Lamentations 26 ezk Ezekiel 27 dan Daniel 28 hos Hosea 29 jol Joel 30 amo Amos 31 oba Obadiah 32 jon Jonah 33 mic Micah 34 nam Nahum 35 hab Habakkuk 36 zep Zephaniah 37 hag Haggai 38 zec Zechariah 39 mal Malachi 41 mat Matthew 42 mrk Mark 43 luk Luke 44 jhn John 45 act Acts 46 rom Romans 47 1co 1 Corinthians 48 2co 2 Corinthians 49 gal Galatians 50 eph Ephesians 51 php Philippians 52 col Colossians 53 1th 1 Thessalonians 54 2th 2 Thessalonians 55 1ti 1 Timothy 56 2ti 2 Timothy 57 tit Titus 58 phm Philemon 59 heb Hebrews 60 jas James 61 1pe 1 Peter 62 2pe 2 Peter 63 1jn 1 John 64 2jn 2 John 65 3jn 3 John 66 jud Jude 67 rev Revelation