forked from WycliffeAssociates/en_ulb_tagged
Tagging ULB
This commit is contained in:
parent
4985c2e6e0
commit
5851b15017
|
@ -0,0 +1,435 @@
|
|||
# Takes current tW entries and populates tagged OGNT XML
|
||||
#
|
||||
# This is the current best version
|
||||
# Requires ULB that includes USFMs.
|
||||
|
||||
# This version maintains the Greek word order to lessen reordering.
|
||||
|
||||
|
||||
use 5.12.0;
|
||||
use File::Slurp;
|
||||
use File::Find ;
|
||||
use Cwd ;
|
||||
use utf8;
|
||||
#use open IN => ":utf8", OUT => ":utf8";
|
||||
use open IO => ":utf8";
|
||||
$" = "\n";
|
||||
|
||||
mkdir "Logs";
|
||||
# ---------------------------------------------------------------------------
# Driver: configuration, file discovery, then the processing passes.
# The first log (Logs/tW_pairs.txt) receives the OGNT file list and the
# per-Strong's-number form lists written by Read_tW_Files; the second log
# (Logs/Log.txt) receives the trace of the tagging passes.
# ---------------------------------------------------------------------------
open(LOG, ">", "Logs/tW_pairs.txt") or die "$!";

# Hard-coded locations of the inputs and of the output directory.
my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml";
my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT";
#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test";
my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible";
my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", "");

# File-level state shared by the subs below.
my ($ULBText, $workText, $language);
my ($file);
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);

# Collect the OGNT XML files whose base name ends with $filePattern.
my @OGNTfilesToRun = ();
#my $filePattern = '\.xml' ;
my $filePattern = '41-MAT\.xml' ;
find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
# File::Find does not promise any particular order; sort for reproducible runs.
@OGNTfilesToRun = sort @OGNTfilesToRun;

say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

# Collect the tW article files. The pattern is used as a regex, so the dot
# must be escaped; a bare '.md' would also accept names such as "foo_md".
my @tWfilesToRun = ();
$filePattern = '\.md' ;
find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;
@tWfilesToRun = sort @tWfilesToRun;

Read_tW_Files();

close LOG;
open(LOG, ">", "Logs/Log.txt") or die "$!";

LongBookNames();
Prepare_ULB_file();
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

ProcessXML();
# put unused SN at end of verse

close LOG;

say "\nDone.";
|
||||
# =====
|
||||
# Read_tW_Files
# Reads every file in @tWfilesToRun and fills %entriesThisSN, which maps each
# Strong's number found on a "* Strong's: ..." line to a ", "-joined list of
# the forms found on the line following "Forms Found in the English ULB:".
# Each file name and each final "SN: forms" list is written to LOG.
#
# The final list for every Strong's number is de-duplicated, free of empty
# forms, and ordered longest form first (ties broken alphabetically so the
# order does not depend on directory traversal order). Longest-first is what
# the original comparator effectively produced: its first and last keys
# compared strings with <=>, leaving only the length key effective.
#
# Dies with the file text when the forms line is present but empty.
sub Read_tW_Files {
    foreach my $twFile (@tWfilesToRun) {
        say LOG $twFile;

        my $fileText = read_file("$twFile", binmode => 'utf8');

        my @sns;
        if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
            @sns = split /, /, $1;
        }

        my $entries;
        if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
            $entries = $1;
            die "$fileText" if $entries eq "";
        }

        foreach my $sn (@sns) {
            # Register the number even when the file lists no forms, so that
            # "exists $entriesThisSN{...}" keeps answering as before, but do
            # not append an empty form: an empty form would later become a
            # pattern that matches at any word boundary.
            $entriesThisSN{$sn} //= "";
            $entriesThisSN{$sn} .= $entries . ", " if defined $entries;
        }
    }

    foreach my $sn (sort keys %entriesThisSN) {
        my %seen;
        my @entries = grep { length($_) && !$seen{$_}++ }
                      split /, /, $entriesThisSN{$sn};

        # Longest first so that a longer phrase is tried before any shorter
        # form it contains.
        @entries = sort { length($b) <=> length($a) || $a cmp $b } @entries;

        $entriesThisSN{$sn} = join ", ", @entries;
        say LOG "$sn: $entriesThisSN{$sn}";
    }
}
|
||||
# LongBookNames
# Loads the table after __DATA__ into %longName. Every row that has at least
# three tab-separated fields contributes one pair: second field => third
# field. Rows with fewer than two tabs are ignored.
sub LongBookNames {
    while (<DATA>) {
        chomp;
        my @fields = split /\t/, $_, -1;
        next if @fields < 3;
        $longName{ $fields[1] } = $fields[2];
    }
}
|
||||
# ProcessXML
# For every file in @OGNTfilesToRun, writes a tagged copy named after the
# file's base name into $outDir. The input is read line by line:
#
#   <verse osisID="bk.ch.vs">  looks up the verse's ULB text and preText
#                              (keyed "$longName{bk} ch:vs") and writes a
#                              <verse name="..."> line;
#   <w ...>greek</w>           collects the Greek text, moves it into a
#                              text="..." attribute, and files the line as
#                              "relevant" (its Strong's number is a key of
#                              %entriesThisSN) or "skipped" (marked with a
#                              check mark);
#   </verse>                   matches the relevant words against the ULB text
#                              (ProcessRelevantSNs), writes the <Greek>,
#                              <preText>, <ULB> and <residue> lines, then all
#                              word lines in OGNTsort order (RestoreGreekOrder),
#                              the verse's <usfm> markers, and the closing tag;
#   anything else              is copied through unchanged.
#
# Sets the file-level $language to "G" or "H" from the two-character file
# number (presumably Greek / Hebrew -- confirm against the tW Strong's keys).
# Dies when the input or output file cannot be opened or the output cannot
# be closed.
sub ProcessXML {
    foreach my $file (@OGNTfilesToRun) {
        my ($greekText, $fileGist);

        # The first two characters of the matched base name (e.g. "41" in
        # 41-MAT.xml) select the Strong's prefix: above 40 is "G", else "H".
        if ($file =~ /((..)....\.xml)/) {
            ($fileGist, $language) = ($1, $2);
            $language = ($language > 40) ? "G" : "H";
        }
        say LOG "<0>\t$file \t$fileGist";

        # Lexical handles with three-argument open; the layer is spelled out
        # so the output encoding matches the input handle.
        open(my $out, ">:utf8", "$outDir/$fileGist") or die "$outDir/$fileGist: $!";
        open(my $thisFile, "<:utf8", "$file") or die "$file:\n$!";

        my ($thisVerse, $staticText, $residueText, $thisPreText);
        my ($matchedLines, $linesNotMatched, $linesWithRelevantSNs);
        my $linesToSkip = "";

        while (my $thisLine = <$thisFile>) {
            chomp $thisLine;

            if ($thisLine =~ /<verse/) { say LOG "\n==========================" }

            if ($thisLine =~ /<\/verse>/) {
                say LOG "<0.1>\t$thisLine";
                say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";

                ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
                say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                # Order the matched lines by the offset recorded after the
                # lozenge separator. Pairs are kept in a list rather than a
                # hash keyed by offset, so two lines that share an offset
                # cannot overwrite each other.
                $matchedLines =~ s/\n{2,}/\n/gs;
                say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
                my @placed;
                while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
                    push @placed, [ $2, $1 ];
                    say LOG "<5>\t\$2: $2\t\$1: $1";
                }
                $matchedLines = "";
                foreach my $pair (sort { $a->[0] <=> $b->[0] } @placed) {
                    my ($place, $line) = @$pair;
                    say LOG "<5.5>\t\$line: $place\t\$orderedLine{$place}: $line";
                    $matchedLines .= "$line\n";
                }
                chomp $matchedLines;
                say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                # Residue: drop USFM markers and the "q" sentinels added at
                # the verse start, and collapse the runs of spaces left where
                # matched words were blanked out.
                $residueText =~ s/<usfm>.*?<\/usfm>//g;
                $residueText =~ s/(^q | q$)//g;
                $residueText =~ s/ {3,}/ /g;

                # Trim leading and trailing spaces.
                for my $text ($residueText, $greekText, $staticText) {
                    $text =~ s/^ +//;
                    $text =~ s/ +$//;
                }

                # One line per <usfm> marker found in the ULB text.
                my $internalUSFM = join "\n",
                    map { "\t\t\t\t\t$_" } ($staticText =~ /<usfm>.*?<\/usfm>/g);
                $internalUSFM =~ s/\t{5,}/\t\t\t\t/g;

                $linesNotMatched =~ s/\n+$//;
                $linesToSkip     =~ s/\n+$//;
                $matchedLines    =~ s/^\n+//;
                say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                say {$out} "\t\t\t\t<Greek>$greekText</Greek>";
                say {$out} "\t\t\t\t<preText>$thisPreText</preText>";
                say {$out} "\t\t\t\t<ULB>$staticText</ULB>";
                say {$out} "\t\t\t\t<residue>$residueText</residue>";
                say LOG "\$matchedLines:\n$matchedLines \$linesNotMatched:\n$linesNotMatched \$linesToSkip:\n$linesToSkip ";

                my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip);
                $wordLines =~ s/<w/\t\t\t\t<w/gs;

                $thisLine = "$wordLines\n$internalUSFM\n$thisLine";
                $thisLine =~ s/\n{2,}/\n/sg;
                say {$out} "$thisLine";

                # Reset per-verse state (the file-level $workText included,
                # as before).
                undef $workText;
                ($greekText, $linesNotMatched, $residueText, $linesWithRelevantSNs) = ();
                $linesToSkip = "";
            }
            elsif ($thisLine =~ /<w /) {
                say LOG "<0.2>\t$thisLine";

                if ($thisLine =~ />([^\n<>]*)</) {
                    $greekText .= $1 . " ";
                }

                # Move the element content into a text="..." attribute so the
                # element body is free for the English form.
                $thisLine =~ s/\t(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;

                my $thisLemma;
                if ($thisLine =~ /lemma="(\d+)"/) {
                    $thisLemma = $language . $1;
                }

                if (defined $thisLemma && exists $entriesThisSN{$thisLemma}) {
                    $linesWithRelevantSNs .= $thisLine . "\n";
                    say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
                }
                else {
                    # No tW forms for this word. Words without a numeric
                    # lemma attribute land here too instead of being dropped
                    # from the output.
                    $thisLine =~ s/><\/w>/>√<\/w>/;
                    $linesToSkip .= "$thisLine\n";
                }
            }
            elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
                my ($bk, $ch, $vs) = ($1, $2, $3);
                say LOG "<0.3>\t$thisLine";

                $thisVerse   = "$longName{$bk} $ch:$vs";
                $greekText   = "";
                $staticText  = $ULBtextThisVerse{$thisVerse};
                # "q" sentinels at both ends; they are stripped again when
                # the verse closes.
                $residueText = "q $staticText q";
                $thisPreText = $ULBpreTextThisVerse{$thisVerse};

                say {$out} "\t\t\t<verse name=\"$thisVerse\">";
            }
            else {
                say {$out} $thisLine;
            }
        }

        close $thisFile;
        close $out or die "$outDir/$fileGist: $!";
    }
}
|
||||
# RestoreGreekOrder
# Pulls every <w OGNTsort="...">...</w> element out of the three text blocks
# it is given and returns them, one per line, in string-sorted order of their
# OGNTsort value. When the same OGNTsort value occurs more than once, the
# last occurrence (later argument, later position) wins. Anything outside a
# matching <w> element is discarded. Returns undef when no element is found.
# Traces its input and result to LOG.
sub RestoreGreekOrder {
    my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_);

    say LOG "<00>\t\$matchedLines: $matchedLines \$linesNotMatched: $linesNotMatched \$linesToSkip: $linesToSkip ";

    my $wordPattern = qr/(<w OGNTsort="([^"]*)"[^>]*>[^<]*<\/w>)/;
    my %elementFor;

    for my $block ($matchedLines, $linesNotMatched, $linesToSkip) {
        say LOG "\$thisOne:\n$block";

        # A list-context /g match yields (element, sort key) pairs.
        my @hits = $block =~ /$wordPattern/g;
        while (my ($element, $sortKey) = splice @hits, 0, 2) {
            $elementFor{$sortKey} = "$element\n";
        }
    }

    my $wordsLine;
    $wordsLine .= $elementFor{$_} for sort keys %elementFor;

    say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<";
    return $wordsLine;
}
|
||||
# ProcessRelevantSNs
# Runs MatchAndPlace for every newline-separated line of $relevantLines that
# carries a numeric lemma="..." attribute; the Strong's number is that lemma
# prefixed with the file-level $language. The residue text and the
# not-matched lines are threaded through the successive calls.
#
# Returns ($matchedLines, $residueText, $linesNotMatched), where
# $matchedLines holds one MatchAndPlace result per processed line, each
# followed by a newline, with a run of trailing newlines collapsed to one.
sub ProcessRelevantSNs {
    my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);

    my $matchedLines;

    for my $wordLine (split /\n/, $relevantLines) {
        next unless $wordLine =~ /lemma="(\d+)"/;

        my $thisSN = $language . $1;
        say LOG "<12>\t\$line: $wordLine, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";

        (my $placedLine, $residueText, $linesNotMatched)
            = MatchAndPlace($wordLine, $thisSN, $staticText, $residueText, $linesNotMatched);

        $placedLine   =~ s/[ \t]+$//;
        $matchedLines .= $placedLine . "\n";
        $matchedLines =~ s/\n{2,}$/\n/s;

        say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++";
    }

    return ($matchedLines, $residueText, $linesNotMatched);
}
|
||||
# MatchAndPlace
# Tries the forms listed for Strong's number $sn (in %entriesThisSN, in list
# order) against $workText and stops at the first form that matches.
#
# A form may be plain ("apostle") or split into two or three parts with
# " ... " ("A ... B", "A ... B ... C"); split forms match their parts in
# order with arbitrary text between them. Forms are matched literally.
#
# Parameters:
#   $line            the <w ...></w> line being tagged
#   $sn              Strong's number key into %entriesThisSN
#   $staticText      unused here; kept for the callers' argument order
#   $workText        the verse text still available for matching
#   $linesNotMatched accumulated lines for which no form matched
#
# Returns ($matchedLines, $workText, $linesNotMatched):
#   on a match   $matchedLines is the line with the form inserted at its
#                first "><", followed by a lozenge and the match offset;
#                in $workText the matched form (or, for split forms, its
#                parts only) is replaced by spaces of the same length;
#   otherwise    $matchedLines is undef and the line, marked with "?", is
#                appended to $linesNotMatched.
sub MatchAndPlace {
    my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;

    my ($found, $matchedLines);
    my @entries = split /, /, $entriesThisSN{$sn};

    foreach my $entry (@entries) {
        # An empty form would turn into a pattern that matches at any word
        # boundary and "tag" the word with nothing.
        next unless length $entry;

        # Per-form state. Declaring it inside the loop keeps a $third left
        # over from an earlier three-part form from leaking into a later
        # two-part form, which would send FixWorkText down the wrong branch.
        my ($workEntry, $entryType);
        my ($first, $second, $third, $firstLen, $secondLen, $thirdLen);

        if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
            ($first, $second, $third) = ($1, $2, $3);
            ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
            $workEntry = "\\b" . quotemeta($first) . "\\b" . ".*?"
                       . "\\b" . quotemeta($second) . "\\b" . ".*?"
                       . "\\b" . quotemeta($third);
            say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 1;
        }
        elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
            ($first, $second) = ($1, $2);
            ($firstLen, $secondLen) = (length $first, length $second);
            $workEntry = "\\b" . quotemeta($first) . "\\b" . ".*?"
                       . "\\b" . quotemeta($second) . "\\b";
            say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 2;
        }
        else {
            $workEntry = quotemeta $entry;
        }

        next unless $workText =~ /\b$workEntry\b/p;

        my ($before, $foundText, $after) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
        my ($place, $foundTextLength) = (length $before, length $foundText);
        say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";

        $line =~ s/></>$entry</;

        if ($entryType) {
            # Split form: mark the matched span with a placeholder and let
            # FixWorkText blank the parts while keeping the text between them.
            $workText = $before . "ı" . $after;
            say LOG "<8.2>\n\$workText,: $workText, \$matchedLines:\n$matchedLines ";

            ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);

            say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
        }
        else {
            # Plain form: blank the whole match, keeping every later offset
            # in the text unchanged.
            $workText = $before . (" " x $foundTextLength) . $after;
        }

        $matchedLines .= "$line◊$place";
        say LOG "<8.4>\tAfter found, new \$workText:\n$workText";

        $found = 1;
        last;
    }

    unless ($found) {
        $line =~ s/></>?</;
        $linesNotMatched .= "$line\n";
    }

    return ($matchedLines, $workText, $linesNotMatched);
}
|
||||
# FixWorkText
# Replaces the first placeholder character (dotless i) in $text with a copy
# of $foundText in which the parts of a split form are blanked out with
# spaces while the text between the parts is kept, so the replacement has
# the same length as the text that was matched.
#
# Parameters:
#   $thisLine                   unused; kept for the caller's argument order
#   $text                       work text containing the placeholder
#   $entry                      only written to LOG
#   $foundText                  the text the form matched
#   $foundTextLength            length of $foundText
#   $first,  $firstLen          first part of the form and its length
#   $second, $secondLen         second part and its length
#   $third,  $thirdLen          optional third part and its length
#
# The parts are matched literally. If a third part is supplied but
# $foundText does not have the three-part shape, the two-part shape is
# tried; if that fails too, the whole match is blanked. The placeholder is
# therefore always replaced.
#
# Returns the updated text as a one-element list.
sub FixWorkText {
    my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;

    my $firstSpace  = " " x ($firstLen  // 0);
    my $secondSpace = " " x ($secondLen // 0);
    my $thirdSpace  = " " x ($thirdLen  // 0);

    say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";

    my $repText;

    if (defined $third && length $third
        && $foundText =~ /\Q$first\E(.*)\Q$second\E(.*)\Q$third\E/) {
        my ($firstGap, $secondGap) = ($1, $2);
        $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
        say LOG "<9.1> \$repText: $repText";
    }
    elsif ($foundText =~ /\Q$first\E(.*)\Q$second\E/) {
        my ($firstGap) = ($1);
        say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
        $repText = "$firstSpace$firstGap$secondSpace";
        say LOG "<9.3> \$repText: |$repText|";
    }
    else {
        # Unexpected shape: blank the whole match rather than leave the
        # placeholder in the text.
        $repText = " " x ($foundTextLength // 0);
    }

    $text =~ s/ı/$repText/;

    return ($text);
}
|
||||
# Prepare_ULB_file
# Reads $ULBfile line by line and fills two hashes keyed by verse name:
#   %ULBpreTextThisVerse  from <preText>...</preText> lines
#   %ULBtextThisVerse     from <text>...</text> lines
# The key is the value of the most recent verse name="..." attribute seen.
# A line is classified by the first of these three patterns it matches, in
# that order. Dies when the file cannot be opened.
sub Prepare_ULB_file {
    open(my $ulbHandle, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";

    my $currentVerse;

    while (my $row = <$ulbHandle>) {
        chomp $row;

        if ($row =~ /verse name="(.*?)"/) {
            $currentVerse = $1;
            next;
        }

        if ($row =~ /<preText>(.*?)<\/preText>/) {
            $ULBpreTextThisVerse{$currentVerse} = $1;
            next;
        }

        if ($row =~ /<text>(.*?)<\/text>/) {
            $ULBtextThisVerse{$currentVerse} = $1;
        }
    }

    close $ulbHandle;
}
|
||||
|
||||
__DATA__
|
||||
01 gen Genesis
|
||||
02 exo Exodus
|
||||
03 lev Leviticus
|
||||
04 num Numbers
|
||||
05 deu Deuteronomy
|
||||
06 jos Joshua
|
||||
07 jdg Judges
|
||||
08 rut Ruth
|
||||
09 1sa 1 Samuel
|
||||
10 2sa 2 Samuel
|
||||
11 1ki 1 Kings
|
||||
12 2ki 2 Kings
|
||||
13 1ch 1 Chronicles
|
||||
14 2ch 2 Chronicles
|
||||
15 ezr Ezra
|
||||
16 neh Nehemiah
|
||||
17 est Esther
|
||||
18 job Job
|
||||
19 psa Psalms
|
||||
20 pro Proverbs
|
||||
21 ecc Ecclesiastes
|
||||
22 sng Song of Solomon
|
||||
23 isa Isaiah
|
||||
24 jer Jeremiah
|
||||
25 lam Lamentations
|
||||
26 ezk Ezekiel
|
||||
27 dan Daniel
|
||||
28 hos Hosea
|
||||
29 jol Joel
|
||||
30 amo Amos
|
||||
31 oba Obadiah
|
||||
32 jon Jonah
|
||||
33 mic Micah
|
||||
34 nam Nahum
|
||||
35 hab Habakkuk
|
||||
36 zep Zephaniah
|
||||
37 hag Haggai
|
||||
38 zec Zechariah
|
||||
39 mal Malachi
|
||||
41 mat Matthew
|
||||
42 mrk Mark
|
||||
43 luk Luke
|
||||
44 jhn John
|
||||
45 act Acts
|
||||
46 rom Romans
|
||||
47 1co 1 Corinthians
|
||||
48 2co 2 Corinthians
|
||||
49 gal Galatians
|
||||
50 eph Ephesians
|
||||
51 php Philippians
|
||||
52 col Colossians
|
||||
53 1th 1 Thessalonians
|
||||
54 2th 2 Thessalonians
|
||||
55 1ti 1 Timothy
|
||||
56 2ti 2 Timothy
|
||||
57 tit Titus
|
||||
58 phm Philemon
|
||||
59 heb Hebrews
|
||||
60 jas James
|
||||
61 1pe 1 Peter
|
||||
62 2pe 2 Peter
|
||||
63 1jn 1 John
|
||||
64 2jn 2 John
|
||||
65 3jn 3 John
|
||||
66 jud Jude
|
||||
67 rev Revelation
|
Loading…
Reference in New Issue