forked from WycliffeAssociates/en_ulb_tagged
436 lines
14 KiB
Perl
436 lines
14 KiB
Perl
# Takes current tW entries and populates tagged OGNT XML
|
||
#
|
||
# This is the current best version
|
||
# Requires ULB that includes USFMs.
|
||
|
||
# This version maintains the Greek word order to lessen reordering.
|
||
|
||
|
||
use 5.12.0;
|
||
use File::Slurp;
|
||
use File::Find ;
|
||
use Cwd ;
|
||
use utf8;
|
||
#use open IN => ":utf8", OUT => ":utf8";
|
||
use open IO => ":utf8";
|
||
$" = "\n";
|
||
|
||
mkdir "Logs";
|
||
open(LOG, ">Logs/tW_pairs.txt") or die "$!";
|
||
my $ULBfile = "/Users/dillardfam/Documents/WACS/fork/ULB_xml/ULB.xml";
|
||
my $topDirOGNT = "/Users/dillardfam/Documents/WACS/OGNT";
|
||
#my $topDirOGNT = "/Users/dillardfam/Documents/WACS/fork/Tag_test";
|
||
my $topDirtW = "/Users/dillardfam/Documents/WACS/en_tw/bible";
|
||
my ($outDir, $outFile) = ("/Users/dillardfam/Documents/WACS/fork/Auto-tagged", "");
|
||
my ($ULBText, $workText, $language);
|
||
my ($file);
|
||
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);
|
||
|
||
my @OGNTfilesToRun = ();
|
||
#my $filePattern = '\.xml' ;
|
||
my $filePattern = '43-LUK\.xml' ;
|
||
find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
|
||
|
||
say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
|
||
|
||
my @tWfilesToRun = ();
|
||
$filePattern = '.md' ;
|
||
find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;
|
||
|
||
Read_tW_Files();
|
||
|
||
close LOG;
|
||
open(LOG, ">Logs/Log.txt") or die "$!";
|
||
|
||
LongBookNames();
|
||
Prepare_ULB_file();
|
||
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
|
||
|
||
ProcessXML();
|
||
# put unused SN at end of verse
|
||
|
||
close LOG;
|
||
|
||
say "\nDone.";
|
||
# =====
|
||
sub Read_tW_Files {
|
||
foreach $file ( @tWfilesToRun ) {
|
||
say LOG $file;
|
||
my (@sns);
|
||
my $entries;
|
||
my $fileText = read_file("$file", binmode => 'utf8');
|
||
if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
|
||
my $sns = $1;
|
||
#say LOG "\t$sns";
|
||
@sns = split /, /, $sns;
|
||
}
|
||
if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
|
||
$entries = $1;
|
||
die "$fileText" if $entries eq "";
|
||
#say LOG "\t\t$entries"
|
||
}
|
||
foreach my $sn (@sns) {
|
||
$entriesThisSN{$sn} .= $entries . ", ";
|
||
#say LOG "\t\t\t$sn: $entriesThisSN{$sn}"
|
||
}
|
||
}
|
||
foreach my $sn (sort keys %entriesThisSN) {
|
||
#say LOG "$sn: $entriesThisSN{$sn}";
|
||
my @entries = split /, /, $entriesThisSN{$sn};
|
||
@entries = reverse sort { substr($a,0,1) <=> substr($b,0,1)
|
||
|| length($a) <=> length($b)
|
||
|| $a <=> $b }
|
||
@entries;
|
||
$entriesThisSN{$sn} = "";
|
||
foreach my $slice (@entries) {
|
||
$entriesThisSN{$sn} .= "$slice, "
|
||
}
|
||
$entriesThisSN{$sn} =~ s/, $//;
|
||
say LOG "$sn: $entriesThisSN{$sn}";
|
||
}
|
||
|
||
}
|
||
sub LongBookNames {
|
||
while (<DATA>) {
|
||
chomp;
|
||
if (/^([^#][^\t]*)\t([^\t]*)\t([^\t]*)/) {
|
||
$longName{$2} = $3
|
||
}
|
||
}
|
||
}
|
||
sub ProcessXML {
|
||
foreach my $file (@OGNTfilesToRun) {
|
||
my $greekText;
|
||
my $fileGist;
|
||
if ($file =~ /((..)....\.xml)/) {
|
||
($fileGist, $language) = ($1, $2);
|
||
if ($language > 40) {
|
||
$language = "G"
|
||
} else {$language = "H"}
|
||
|
||
}
|
||
say LOG "<0>\t$file \t$fileGist";
|
||
open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!";
|
||
my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput,
|
||
$linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText);
|
||
open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";
|
||
my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount);
|
||
while (my $thisLine = <$thisFile>) {
|
||
chomp $thisLine;
|
||
if ($thisLine =~ /<verse/) {say LOG "\n=========================="}
|
||
if ($thisLine =~ /<\/verse>/) {
|
||
say LOG "<0.1>\t$thisLine";
|
||
say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";
|
||
|
||
($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
|
||
say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
|
||
my %orderedLine;
|
||
$matchedLines =~ s/\n{2,}/\n/gs;
|
||
say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
|
||
while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
|
||
$orderedLine{$2} = $1;
|
||
say LOG "<5>\t\$2: $2\t\$1: $1";
|
||
}
|
||
$matchedLines = "";
|
||
foreach my $line (sort {$a <=> $b} keys %orderedLine) {
|
||
say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
|
||
$matchedLines .= "$orderedLine{$line}\n"
|
||
}
|
||
chomp $matchedLines;
|
||
say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
|
||
$residueText =~ s/<usfm>.*?<\/usfm>//g;
|
||
$residueText =~ s/(^q | q$)//g;
|
||
$residueText =~ s/ {3,}/ /g;
|
||
$residueText =~ s/^ +//;
|
||
$residueText =~ s/ +$/$1/;
|
||
$greekText =~ s/^ +//;
|
||
$greekText =~ s/ +$/$1/;
|
||
$staticText =~ s/^ +//;
|
||
$staticText =~ s/ +$/$1/;
|
||
my $internalUSFM;
|
||
$internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /<usfm>.*?<\/usfm>/g);
|
||
$linesNotMatched =~ s/\n+$//;
|
||
$linesToSkip =~ s/\n+$//;
|
||
$matchedLines =~ s/^\n+//;
|
||
$internalUSFM =~ s/\n+$//;
|
||
$internalUSFM =~ s/\t{5,}/\t\t\t\t/g;
|
||
say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
|
||
say OUT "\t\t\t\t<Greek>$greekText</Greek>";
|
||
say OUT "\t\t\t\t<preText>$thisPreText</preText>";
|
||
say OUT "\t\t\t\t<ULB>$staticText</ULB>";
|
||
say OUT "\t\t\t\t<residue>$residueText</residue>";
|
||
say LOG "\$matchedLines:\n$matchedLines \$linesNotMatched:\n$linesNotMatched \$linesToSkip:\n$linesToSkip ";
|
||
my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip);
|
||
#say OUT "$matchedLines" unless ($matchedLines eq "");
|
||
#say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
|
||
#say OUT "$internalUSFM" unless ($internalUSFM eq "");
|
||
#say OUT "$linesToSkip" if ($linesToSkip);
|
||
$wordLines =~ s/<w/\t\t\t\t<w/gs;
|
||
$thisLine = "$wordLines\n$internalUSFM\n$thisLine";
|
||
$thisLine =~ s/\n{2,}/\n/sg;
|
||
say OUT "$thisLine";
|
||
($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount) = ();
|
||
($thisVerseForOutput, $flag, $workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $orderedOutputLines, $linesWithRelevantSNs) = ();
|
||
($linesToSkip) = ("");
|
||
}
|
||
elsif ($thisLine =~ /<w /) {
|
||
say LOG "<0.2>\t$thisLine";
|
||
$originalLinesCount ++;
|
||
if ($thisLine =~ />([^\n<>]*)</) {
|
||
$greekText .= $1 . " "
|
||
}
|
||
$thisLine =~ s/\t(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
|
||
if ($thisLine =~ /lemma="(\d+)"/) {
|
||
my $thisLemma = $language . $1;
|
||
if (exists $entriesThisSN{$thisLemma}) {
|
||
$rsnCount ++;
|
||
$linesWithRelevantSNs .= $thisLine . "\n";
|
||
say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
|
||
}
|
||
else {
|
||
$skipCount ++;
|
||
$thisLine =~ s/><\/w>/>√<\/w>/;
|
||
$linesToSkip .= "$thisLine\n";
|
||
#say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
|
||
}
|
||
}
|
||
}
|
||
elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
|
||
say LOG "<0.3>\t$thisLine";
|
||
($bk, $ch, $vs) = ($1,$2,$3);
|
||
($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
|
||
$staticText = $ULBtextThisVerse{$thisVerse};
|
||
$residueText = "q $staticText q";
|
||
$thisPreText = $ULBpreTextThisVerse{$thisVerse};
|
||
say OUT "\t\t\t<verse name=\"$thisVerse\">";
|
||
($flag) = (1);
|
||
}
|
||
else {say OUT $thisLine}
|
||
}
|
||
|
||
close $thisFile;
|
||
close OUT;
|
||
}
|
||
}
|
||
sub RestoreGreekOrder {
|
||
my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_);
|
||
say LOG "<00>\t\$matchedLines: $matchedLines \$linesNotMatched: $linesNotMatched \$linesToSkip: $linesToSkip ";
|
||
my $wordsLine;
|
||
my %order;
|
||
foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) {
|
||
say LOG "\$thisOne:\n$thisOne";
|
||
while ($thisOne =~ /(<w OGNTsort="([^"]*)"[^>]*>[^<]*<\/w>)/g) {
|
||
$order{$2} = "$1\n";
|
||
}
|
||
}
|
||
foreach my $line (sort keys %order) {
|
||
$wordsLine .= $order{$line}
|
||
}
|
||
say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<";
|
||
return $wordsLine
|
||
}
|
||
sub ProcessRelevantSNs {
|
||
my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
|
||
my ($matchedLines, $thisLine);
|
||
my @relevantLines = split /\n/, $relevantLines;
|
||
foreach my $line (@relevantLines) {
|
||
if ($line =~ /lemma="(\d+)"/) {
|
||
my $thisSN = $language . $1;
|
||
say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";
|
||
($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
|
||
$thisLine =~ s/[ \t]+$//;
|
||
$matchedLines .= $thisLine . "\n";
|
||
$matchedLines =~ s/\n{2,}$/\n/s;
|
||
say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++"
|
||
}
|
||
}
|
||
return ($matchedLines, $residueText, $linesNotMatched);
|
||
}
|
||
sub MatchAndPlace {
|
||
my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
|
||
#say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText";
|
||
my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen);
|
||
my @entries = split /, /, $entriesThisSN{$sn};
|
||
foreach my $entry (@entries) {
|
||
my $entryType;
|
||
if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
|
||
($first, $second, $third) = ($1, $2, $3);
|
||
($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
|
||
$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third;
|
||
say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|";
|
||
$entryType = 1;
|
||
}
|
||
elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
|
||
($first, $second) = ($1, $2);
|
||
($firstLen, $secondLen) = (length $first, length $second);
|
||
$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b";
|
||
say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|";
|
||
$entryType = 2;
|
||
}
|
||
else {$workEntry = $entry;}
|
||
|
||
my $foundText;
|
||
#say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
|
||
if ($workText =~ /\b$workEntry\b/p) {
|
||
say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
|
||
($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}");
|
||
my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, "");
|
||
$line =~ s/></>$entry</;
|
||
while (length $replacementSpaces < $foundTextLength) {$replacementSpaces .= " "}
|
||
|
||
if ($entryType) {
|
||
|
||
say LOG "<8.2>\n\$workText,: $workText, \$matchedLines:\n$matchedLines ";
|
||
|
||
($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);
|
||
|
||
say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
|
||
|
||
}
|
||
|
||
else {$workText =~ s/ı/$replacementSpaces/;}
|
||
|
||
$matchedLines .= "$line◊$place";
|
||
|
||
say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
|
||
$found = 1;
|
||
}
|
||
else {
|
||
#say LOG "\$workEntry $workEntry not found"
|
||
}
|
||
if ($found) {
|
||
last
|
||
}
|
||
}
|
||
unless ($found) {
|
||
$line =~ s/></>√</;
|
||
$linesNotMatched .= "$line\n"
|
||
}
|
||
return ($matchedLines, $workText, $linesNotMatched)
|
||
}
|
||
sub FixWorkText {
|
||
my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;
|
||
my ($firstSpace, $secondSpace, $thirdSpace);
|
||
while (length $firstSpace < $firstLen) {$firstSpace .= " "}
|
||
while (length $secondSpace < $secondLen) {$secondSpace .= " "}
|
||
while (length $thirdSpace < $thirdLen) {$thirdSpace .= " "}
|
||
|
||
say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";
|
||
if ($third) {
|
||
if ($foundText =~ /$first(.*)$second(.*)$third/) {
|
||
my ($firstGap, $secondGap) = ($1, $2);
|
||
my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
|
||
say LOG "<9.1> \$repText: $repText";
|
||
$text =~ s/ı/$repText/;
|
||
}
|
||
}
|
||
else {
|
||
if ($foundText =~ /$first(.*)$second/) {
|
||
my ($firstGap) = ($1);
|
||
say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
|
||
my $repText ="$firstSpace$firstGap$secondSpace";
|
||
say LOG "<9.3> \$repText: |$repText|";
|
||
$text =~ s/ı/$repText/;
|
||
}
|
||
}
|
||
return ($text)
|
||
}
|
||
sub Prepare_ULB_file {
|
||
|
||
my $thisVerse;
|
||
#$ULBText = read_file($ULBfile, binmode => 'utf8');
|
||
|
||
#while ($ULBText =~ /<verse name="(.*?)">\n<preText>(.*?)<\/preText>\n.*<text>(.*?)<\/text>.*<\/verse>/sg) {
|
||
# ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2);
|
||
#}
|
||
#
|
||
|
||
open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";
|
||
|
||
while (my $thisLine = <$file>) {
|
||
chomp $thisLine;
|
||
if ($thisLine =~ /verse name="(.*?)"/) {
|
||
$thisVerse = $1;
|
||
#say LOG "$thisVerse:\n$thisLine"
|
||
}
|
||
elsif ($thisLine =~ /<preText>(.*?)<\/preText>/) {
|
||
$ULBpreTextThisVerse{$thisVerse} = $1;
|
||
#say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}"
|
||
}
|
||
elsif ($thisLine =~ /<text>(.*?)<\/text>/) {
|
||
$ULBtextThisVerse{$thisVerse} = $1;
|
||
#say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}"
|
||
}
|
||
}
|
||
|
||
close $file;
|
||
}
|
||
|
||
__DATA__
|
||
01 gen Genesis
|
||
02 exo Exodus
|
||
03 lev Leviticus
|
||
04 num Numbers
|
||
05 deu Deuteronomy
|
||
06 jos Joshua
|
||
07 jdg Judges
|
||
08 rut Ruth
|
||
09 1sa 1 Samuel
|
||
10 2sa 2 Samuel
|
||
11 1ki 1 Kings
|
||
12 2ki 2 Kings
|
||
13 1ch 1 Chronicles
|
||
14 2ch 2 Chronicles
|
||
15 ezr Ezra
|
||
16 neh Nehemiah
|
||
17 est Esther
|
||
18 job Job
|
||
19 psa Psalms
|
||
20 pro Proverbs
|
||
21 ecc Ecclesiastes
|
||
22 sng Song of Solomon
|
||
23 isa Isaiah
|
||
24 jer Jeremiah
|
||
25 lam Lamentations
|
||
26 ezk Ezekiel
|
||
27 dan Daniel
|
||
28 hos Hosea
|
||
29 jol Joel
|
||
30 amo Amos
|
||
31 oba Obadiah
|
||
32 jon Jonah
|
||
33 mic Micah
|
||
34 nam Nahum
|
||
35 hab Habakkuk
|
||
36 zep Zephaniah
|
||
37 hag Haggai
|
||
38 zec Zechariah
|
||
39 mal Malachi
|
||
41 mat Matthew
|
||
42 mrk Mark
|
||
43 luk Luke
|
||
44 jhn John
|
||
45 act Acts
|
||
46 rom Romans
|
||
47 1co 1 Corinthians
|
||
48 2co 2 Corinthians
|
||
49 gal Galatians
|
||
50 eph Ephesians
|
||
51 php Philippians
|
||
52 col Colossians
|
||
53 1th 1 Thessalonians
|
||
54 2th 2 Thessalonians
|
||
55 1ti 1 Timothy
|
||
56 2ti 2 Timothy
|
||
57 tit Titus
|
||
58 phm Philemon
|
||
59 heb Hebrews
|
||
60 jas James
|
||
61 1pe 1 Peter
|
||
62 2pe 2 Peter
|
||
63 1jn 1 John
|
||
64 2jn 2 John
|
||
65 3jn 3 John
|
||
66 jud Jude
|
||
67 rev Revelation
|