forked from WycliffeAssociates/en_ulb_tagged
Now needed for tagging
This commit is contained in:
parent
13b8c8c948
commit
91d789da65
|
@ -0,0 +1,180 @@
|
||||||
|
# Builds easily searchable file from current OGNT and MAST-HB XML file
|
||||||
|
# Takes verse at a time from slurped file
|
||||||
|
# Useful for Mine routine building MAST PDF
|
||||||
|
use 5.18.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
use utf8;
|
||||||
|
#use open IN => ":utf8", OUT => ":utf8";
|
||||||
|
use open IO => ":utf8";
|
||||||
|
# ---------------------------------------------------------------------------
# Main program.
#
# For each of the two source trees every .xml file is slurped and scanned one
# <verse> element at a time.  Verses that carry a <note>KJV:Book.ch.vs</note>
# marker are re-labelled or split according to that reference.  Whatever has
# been collected in $outText is written to Output/Original_languages.txt at
# the end; a trace of every step goes to Logs/log.txt.
#
# NOTE(review): verses without a KJV note currently contribute nothing to the
# output -- the code that handled them is still commented out below.
# ---------------------------------------------------------------------------
open LOG, ">:utf8", "Logs/log.txt" or die "Logs/log.txt: $!";
open OUT, ">:utf8", "Output/Original_languages.txt" or die "Output/Original_languages.txt: $!";

my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT");
my (%order, %long);
my $outText;

# Load the book table from __DATA__.  Each row must be TAB separated:
# number, short code, full name.  %order is keyed by number and %long by
# short code; both hold the full book name.
while (<DATA>) {
    chomp;
    if (/^([^\t]*)\t([^\t]*)\t(.*)$/) {
        $order{$1} = $3;
        $long{$2} = $3;
    }
}
#foreach my $key (sort keys %long) {
#    say LOG $key . "\t" . $long{$key};
#}

foreach my $folder (@folders) {
    say LOG "$folder";
    #system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";
    my ($topDir, $lang) = ($folder, "H");

    if ($folder =~ /OGNT/) {$lang = "G"}

    # Collect the XML files.  The pattern is a regular expression, not a shell
    # glob: the former '*.xml' accepted any name ending in "<any char>xml".
    my @filesToRun = ();
    my $filePattern = '\.xml';
    find( sub { push @filesToRun, $File::Find::name if ( m/$filePattern$/ ) }, $topDir) ;
    @filesToRun = sort @filesToRun;

    foreach my $file ( @filesToRun ) {
        say LOG $file;
        my $fileText = read_file("$file", binmode => 'utf8');

        # These live for the whole file so that $current, $interruption and
        # $oldHold carry over from one verse to the next.
        my ($bk, $ch, $vs, $lemma, $word, $nbk, $nch, $nvs, $previous, $current, $interruption, $next, $verse, $thisBookText, $prevVsText, $holdText, $thisVsText, $nextVsText, $oldHold, $shortCur, $shortIntr);

        # Under /s "." already matches newlines, so ".*?" replaces "(\n|.)*?".
        while ($fileText =~ /<verse osisID="((.*?)\.(\d+)\.(\d+))".*?<\/verse>/spg) {
            $verse = ${^MATCH};
            say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4";
            ($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4);
            $previous = $current;
            $current = "$bk $ch:$vs";
            say LOG "<0>\t\$current: $current";
            my $verseText;

            # The separator between chapter and verse is a literal dot; it was
            # previously an unescaped "." that matched any character.
            if ($verse =~ /<note>KJV:(([^\.]*)\.([^\.]*)\.([^<]*))<\/note>/p) {    # Occurs only in OT
                say LOG "<1>\t${^MATCH}";
                ($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4);
                $interruption = "$nbk $nch:$nvs";
                say LOG "<2>\t\$interruption: $interruption (of $current)";

                # \Q...\E keeps the dots inside the references literal.
                if ($verse =~ /<verse osisID="\Q$shortCur\E">\n[^<\n]*<note>KJV:\Q$shortIntr\E<\/note>/p) {    # Complete renumber of verse
                    say LOG "<3>\t${^MATCH}";
                    $current = $interruption;
                    $verseText = GetContent($verse);
                    $verseText = "$current\t$oldHold$verseText";
                    $oldHold = "";
                }
                elsif ($interruption ne $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*)\.([^<]*)<\/note>/p) {    # New verse begins here
                    say LOG "<4>\t${^MATCH}";
                    # Text before the note stays with this verse; text after it
                    # is held back in $oldHold for the verse that follows.
                    ($thisVsText, $nextVsText) = (${^PREMATCH}, ${^POSTMATCH});
                    $thisVsText = GetContent($thisVsText);
                    $nextVsText = GetContent($nextVsText);
                    $outText .= "$oldHold\n$current\t$thisVsText ";
                    $oldHold = "$nextVsText ";
                }
                elsif ($interruption eq $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*)\.([^<]*)<\/note>/p) {    # Previous verse continues here
                    say LOG "<5>\t${^MATCH}";
                    ($prevVsText, $thisVsText) = (${^PREMATCH}, ${^POSTMATCH});
                    $prevVsText = GetContent($prevVsText);
                    $thisVsText = GetContent($thisVsText);
                    $verseText .= "$oldHold\n$current\t$thisVsText";
                    $oldHold = "";
                }
            }
            else {
                # The whole verse should be processed in one piece
                #$verseText = GetContent($verse);
                #$verseText = "$current\$tverseText"
            }
            #$thisBookText .= "\n$verseText";
            #$oldHold = $holdText
        }
        #$thisBookText =~ s/</<$lang/g;
        #$outText .= "$thisBookText\n";
    }
}

say OUT $outText;

close OUT;
close LOG;

print "\n\tDone.";
|
||||||
|
|
||||||
|
# GetContent($xml)
#
# Pulls every <w lemma="...">word</w> element out of $xml and returns them as
# one string of the form "word <number> word <number> " (note the trailing
# space).  The first run of digits in the lemma attribute is kept and the
# non-digit text around it is dropped.  Returns "" when $xml has no <w>
# elements.
sub GetContent {
    my ($text) = @_;
    my $returnText = "";

    # /g is essential: without it the match restarts at the beginning of
    # $text on every pass and the loop never ends once one <w> is present.
    while ($text =~ /<w lemma="([^"]*)"[^>]*>([^<]*)<\/w>/g) {
        my ($lemma, $OL) = ($1, $2);
        $lemma =~ s/[^\d"]*(\d+)[^\d"]*/$1/;
        $returnText .= "$OL <$lemma> ";
    }
    return $returnText;
}
|
||||||
|
|
||||||
|
__DATA__
|
||||||
|
01 gen Genesis
|
||||||
|
02 exo Exodus
|
||||||
|
03 lev Leviticus
|
||||||
|
04 num Numbers
|
||||||
|
05 deu Deuteronomy
|
||||||
|
06 jos Joshua
|
||||||
|
07 jdg Judges
|
||||||
|
08 rut Ruth
|
||||||
|
09 1sa 1 Samuel
|
||||||
|
10 2sa 2 Samuel
|
||||||
|
11 1ki 1 Kings
|
||||||
|
12 2ki 2 Kings
|
||||||
|
13 1ch 1 Chronicles
|
||||||
|
14 2ch 2 Chronicles
|
||||||
|
15 ezr Ezra
|
||||||
|
16 neh Nehemiah
|
||||||
|
17 est Esther
|
||||||
|
18 job Job
|
||||||
|
19 psa Psalms
|
||||||
|
20 pro Proverbs
|
||||||
|
21 ecc Ecclesiastes
|
||||||
|
22 sng Song of Solomon
|
||||||
|
23 isa Isaiah
|
||||||
|
24 jer Jeremiah
|
||||||
|
25 lam Lamentations
|
||||||
|
26 ezk Ezekiel
|
||||||
|
27 dan Daniel
|
||||||
|
28 hos Hosea
|
||||||
|
29 jol Joel
|
||||||
|
30 amo Amos
|
||||||
|
31 oba Obadiah
|
||||||
|
32 jon Jonah
|
||||||
|
33 mic Micah
|
||||||
|
34 nam Nahum
|
||||||
|
35 hab Habakkuk
|
||||||
|
36 zep Zephaniah
|
||||||
|
37 hag Haggai
|
||||||
|
38 zec Zechariah
|
||||||
|
39 mal Malachi
|
||||||
|
41 mat Matthew
|
||||||
|
42 mrk Mark
|
||||||
|
43 luk Luke
|
||||||
|
44 jhn John
|
||||||
|
45 act Acts
|
||||||
|
46 rom Romans
|
||||||
|
47 1co 1 Corinthians
|
||||||
|
48 2co 2 Corinthians
|
||||||
|
49 gal Galatians
|
||||||
|
50 eph Ephesians
|
||||||
|
51 php Philippians
|
||||||
|
52 col Colossians
|
||||||
|
53 1th 1 Thessalonians
|
||||||
|
54 2th 2 Thessalonians
|
||||||
|
55 1ti 1 Timothy
|
||||||
|
56 2ti 2 Timothy
|
||||||
|
57 tit Titus
|
||||||
|
58 phm Philemon
|
||||||
|
59 heb Hebrews
|
||||||
|
60 jas James
|
||||||
|
61 1pe 1 Peter
|
||||||
|
62 2pe 2 Peter
|
||||||
|
63 1jn 1 John
|
||||||
|
64 2jn 2 John
|
||||||
|
65 3jn 3 John
|
||||||
|
66 jud Jude
|
||||||
|
67 rev Revelation
|
|
@ -0,0 +1,126 @@
|
||||||
|
# Creates workable ULB.xml file that has all USFM markers in place.
|
||||||
|
|
||||||
|
use 5.18.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
use utf8;
|
||||||
|
#use open IN => ":utf8", OUT => ":utf8";
|
||||||
|
use open IO => ":utf8";
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Main program: collect every .usfm file below $topDir, convert each one with
# ReadFiles() and wrap the result in a single <xml> element in
# $outDir/ULB.xml.  A trace is written to Logs/Log.txt.
# ---------------------------------------------------------------------------
my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml");

# Three-argument open keeps the mode separate from the file name; the output
# path is now derived from $outDir instead of being spelled out a second time.
open(LOG, '>:utf8', "Logs/Log.txt") or die "Logs/Log.txt: $!";
open(OUT, '>:utf8', "$outDir/ULB.xml") or die "$outDir/ULB.xml: $!";
say OUT "<xml>";

my @filesToRun = ();
my $filePattern = '\.usfm' ;
#my $filePattern = '67-REV\.usfm' ;
my $file;
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;

@filesToRun = sort @filesToRun;

ReadFiles();

say OUT "</xml>";
# Buffered write errors only surface at close, so check it.
close OUT or die "$outDir/ULB.xml: $!";
close LOG;

say "\nDone.";
|
||||||
|
# =====
|
||||||
|
# ReadFiles()
#
# Converts every USFM file listed in @filesToRun into a <book> element that is
# printed to OUT.  The file is first flattened to a single line and then
# re-split so that each \s5 section and each \v verse starts its own line.
# Every verse line becomes
#     <verse name="Book c:v"><preText>...</preText><text>...</text></verse>
# and USFM markers inside <text> are wrapped by TagInternalUSFM().  The book
# name comes from the file's \h line.  Takes no arguments and returns nothing
# useful; intermediate states are traced to LOG.
sub ReadFiles {

    foreach my $file ( @filesToRun ) {
        say $file;
        my $fileText = read_file("$file", binmode => 'utf8');
        $fileText =~ s/[ \n]+$//;
        say LOG "|$fileText|";

        my ($book, $chap, $vers, $chapStart);
        if ($fileText =~ /\\h ([^\n]*)/) {
            $book = $1;
        }
        #say LOG $book;

        # Flatten to one line, then collapse the runs of blanks that leaves
        # behind.  (The statement here used to replace one blank with one
        # blank, which did nothing.)
        $fileText =~ s/\n/ /g;
        $fileText =~ s/ {2,}/ /g;

        # Start a new line at every \s5 section marker.
        $fileText =~ s/(\\s5)/\n$1/g;

        # Put every verse on a line of its own.  \v is swapped for a
        # one-character stand-in so that "up to the next verse" can be written
        # as a negated character class; the look-ahead does in a single pass
        # what previously needed one substitution per verse.
        $fileText =~ s/\\v/√/g;
        $fileText =~ s/(√[^√\n]*)(?=√)/$1\n/g;
        $fileText =~ s/√/\\v/g;

        $fileText =~ s/(\\id[^\n]*)\n/\t\t<heading>$1<\/heading>\n/;
        $fileText =~ s/ +\n/\n/g;
        # A paragraph or poetry marker left at the end of a line belongs to
        # the text that follows it, so move it to the start of the next line.
        $fileText =~ s/(\\(q\d?|pi?|m|n?b))\n/\n$1 /g;
        #say LOG $fileText;

        my @array = split /\n/, $fileText;
        $fileText = "";
        foreach my $line (@array) {
            # (A bare "chomp" used to sit here; it acted on $_, not on $line,
            # and split has already removed the newlines.)
            if ($line =~ /<book name="(.*?)">/) {$book = $1;}

            if ($line =~ /\\c (\d+).* \\v (\d+)/) {
                # First verse of a chapter: open the chapter, closing the
                # previous one if there was one.
                ($chap, $vers) = ($1, $2);
                $line = "\t\t<chapter name=\"$book $chap\">\n\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>";
                $line = "\t\t</chapter>\n$line" if $chapStart;
                $chapStart = 1;
            }
            elsif ($line =~ /\\v (\d+)/) {
                $vers = $1;
                $line = "\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>";
            }
            #say LOG "===\n<AA>\n$line";

            # Everything up to and including "\v N " is preText, the rest is text.
            $line =~ s/(<verse[^>]*>)(.*\\v \d+ )(.*)(<\/verse>)/$1\n\t\t\t\t<preText>$2<\/preText>\n\t\t\t\t<text>$3<\/text>\n\t\t\t$4/s;
            #say LOG "===\n<BB>\n$line";

            if ($line =~ /<text>.*<\/text>/p) {
                say LOG "<-0>\t$line";
                my ($pre, $match, $post) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
                #say LOG "<-1>\t\$pre: $pre,\n\$match: $match,\n\$post: $post";
                $match = TagInternalUSFM ($match);
                $line = $pre . $match . $post;
            }

            say LOG "---\n<CC>\n$line\n===";
            $line =~ s# +</#</#g;
            $fileText .= $line . "\n";
        }

        # Only close a chapter if one was opened; a file without any \c line
        # used to produce an unbalanced </chapter>.
        my $chapterClose = $chapStart ? "\t\t</chapter>\n" : "";
        say OUT "\t<book name=\"$book\">\n$fileText$chapterClose\t</book>";
    }

}
|
||||||
|
|
||||||
|
# TagInternalUSFM($line)
#
# $line is a "<text>...</text>" fragment.  Every footnote (\f ... \f*), every
# \qs ... \qs* span and every \b, \m, \p, \pi, \q, \qN or \s2 marker that is
# followed by a blank is wrapped in <usfm>...</usfm>.  The pieces are first
# swapped for numbered <place number="N"/> stand-ins so that a later pattern
# cannot match inside an earlier piece, and are put back at the end.
# Returns the rewritten fragment and logs it to LOG.
sub TagInternalUSFM {
    my ($line) = @_;
    my $placeNum = 1;
    my %places;
    #say LOG "Tagging internal USFM in \$line $line.";

    # Paired markers: footnotes first, then \qs spans.  The leading greedy
    # ".*" means the last occurrence in the line is taken first.
    foreach my $span (qr/\\f .*?\\f\*/, qr/\\qs .*?\\qs\*/) {
        while ($line =~ s{(<text>.*)($span)(.*</text>)}{$1<place number="$placeNum"/>$3}) {
            $places{$placeNum} = $2;
            $placeNum++;
        }
    }

    # Stand-alone markers.  The blank after the marker is now required by the
    # substitution itself; before, only the loop test required it, so the
    # substitution could cut "\b" out of a longer marker such as "\bd".
    while ($line =~ s{(<text>.*)(\\(?:[bm]|pi?|q\d?|s2))( .*</text>)}{$1<place number="$placeNum"/>$3}) {
        $places{$placeNum} = $2;
        $placeNum++;
    }
    #say LOG "<+6>\t$line";

    # Collapse runs of blanks.  (This used to replace one blank with one
    # blank, which did nothing.)
    $line =~ s/ {2,}/ /g;
    #say LOG "<+7>\t$line";

    # Restore highest number first: a span stored later may itself contain an
    # earlier stand-in, which only becomes visible once the later one is back.
    foreach my $place (sort { $b <=> $a } keys %places) {
        #say LOG "<+8>\tReplacing <place number=\"$place\"\/> with <usfm>$places{$place}<\/usfm> in\n$line.";
        $line =~ s/<place number="$place"\/>/<usfm>$places{$place}<\/usfm>/
            or die "TagInternalUSFM: stand-in $place not found in: $line";
    }
    say LOG "<+9>\t$line";
    return $line;
}
|
|
@ -0,0 +1,113 @@
|
||||||
|
# Checks ULB in ULB.xml against tagged ULB
|
||||||
|
use 5.18.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
use utf8;
|
||||||
|
#use open IN => ":utf8", OUT => ":utf8";
|
||||||
|
use open IO => ":utf8";
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Main program: load ULB.xml once, read the list of books to check from
# __DATA__ and compare each manually tagged book against it.  Mismatches are
# reported in Logs/Log.txt.
# ---------------------------------------------------------------------------
open(LOG, '>:utf8', "Logs/Log.txt") or die "Logs/Log.txt: $!";

my ($ULBxml, $taggedULBDir) = ("/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/ULB_xml/ULB.xml", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Manual_Tagging");
my (@filesToRun) = ();
my %fullName;
# Single quotes keep the backslash; in double quotes "\.xml" is just ".xml".
my $filePattern = '\.xml' ;
my $file;

my $xmlText = read_file("$ULBxml", binmode => 'utf8');

GetBooksToCheck();
Compare();
|
||||||
|
|
||||||
|
# Compare()
#
# Walks the book list in @filesToRun, slurps "<name>.xml" from $taggedULBDir
# for each entry and hands the book to GetGist() for the verse-by-verse check.
sub Compare {
    for my $bookFile (@filesToRun) {
        my $taggedPath = "$taggedULBDir/$bookFile.xml";

        say LOG "|$bookFile|, |$taggedPath|";
        my $taggedText = read_file($taggedPath, binmode => 'utf8');

        GetGist($bookFile, $taggedText);
    }
}
|
||||||
|
|
||||||
|
# GetGist($fileName, $wholeTaggedText)
#
# $fileName is a key of %fullName (it yields the book name used in verse
# names); $wholeTaggedText is the complete tagged XML of that book.
# For every <verse> in the tagged text, the preText and the block of <w ...>
# lines are reduced to plain text by Untag() and compared with the
# preText + text of the same verse in $xmlText (ULB.xml), likewise stripped of
# tags.  Differences are written to LOG as a MISMATCH record.
sub GetGist {
    my ($fileName, $wholeTaggedText) = @_;
    my $bookName = $fullName{$fileName};
    say LOG "|$fileName|, |$bookName|";

    # \Q...\E keeps the book name literal.  Under /s "." already matches
    # newlines, so ".*?" replaces the former "(.|\n)*?" groups.
    while ($wholeTaggedText =~ /<verse name="(\Q$bookName\E \d+:\d+)">.*?<preText>(.*?)<\/preText>.*?\n\t+((?:<w .*?\n)*)\t+<\/verse>/sg) {
        my ($verseRef, $preText, $gist) = ($1, $2, $3);
        say LOG "\$verseRef: |$verseRef|";

        unless ($xmlText =~ /<verse name="\Q$verseRef\E">\n\t+<preText>([^\n]*)<\/preText>\n\t+<text>([^\n]*)<\/text>\n\t+<\/verse>/s) {
            # Previously such a verse was skipped without any trace.
            say LOG "\nNOT FOUND in ULB.xml: $verseRef\n";
            next;
        }
        my ($standardPT, $standardT) = ($1, $2);

        my ($tagged) = Untag($preText, $gist);
        #say LOG $tagged;

        # Bring the reference text into the same shape as Untag's result.
        my $standard = $standardPT . " " . $standardT;
        $standard =~ s/<[^<>]*>//g;
        $standard =~ s/ {2,}/ /g;
        $standard =~ s/ +$//;

        if ($standard ne $tagged) {
            say LOG "\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n";
        }
    }
}
|
||||||
|
|
||||||
|
# Untag($pre, $txt)
#
# Turns a block of tagged lines into plain text: tabs and newlines become
# blanks, every <...> tag and every "√" is removed, $pre is put in front
# (separated by one blank), runs of blanks are collapsed and trailing blanks
# are dropped.  Returns the resulting string.
sub Untag {
    my ($pre, $txt) = @_;
    #say LOG "\$pre: $pre\n\$txt: $txt";

    (my $plain = $txt) =~ tr/\t\n/ /;
    $plain =~ s/(?:√|<[^<>]*>)//g;

    my $joined = $pre . " " . $plain;
    for ($joined) {
        s/ {2,}/ /g;
        s/ +$//;
    }
    return $joined;
}
|
||||||
|
|
||||||
|
# GetBooksToCheck()
#
# Reads the __DATA__ table.  Rows starting with "#" are ignored; for every
# other row of the form "<file stem> TAB <book name>" the stem is echoed to
# STDOUT, appended to @filesToRun and recorded in %fullName.
sub GetBooksToCheck {
    while (my $row = <DATA>) {
        chomp $row;
        next if $row =~ /^#/;
        next unless $row =~ /([^\t]*)\t([^\t]*)/;

        my ($stem, $title) = ($1, $2);
        say "|$stem|";
        push @filesToRun, "$stem";
        $fullName{$stem} = $title;
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
# Runs after Compare() has returned (the sub definitions above are not
# executed in sequence).  Check the close: buffered write errors on the log
# only surface here.
close LOG or die "Logs/Log.txt: $!";

say "\nDone.";
|
||||||
|
# =====
|
||||||
|
__DATA__
|
||||||
|
#41-MAT Matthew
|
||||||
|
#42-MRK Mark
|
||||||
|
#43-LUK Luke
|
||||||
|
#44-JHN John
|
||||||
|
#45-ACT Acts
|
||||||
|
#46-ROM Romans
|
||||||
|
#47-1CO 1 Corinthians
|
||||||
|
#48-2CO 2 Corinthians
|
||||||
|
#49-GAL Galatians
|
||||||
|
#50-EPH Ephesians
|
||||||
|
#51-PHP Philippians
|
||||||
|
#52-COL Colossians
|
||||||
|
#53-1TH 1 Thessalonians
|
||||||
|
#54-2TH 2 Thessalonians
|
||||||
|
#55-1TI 1 Timothy
|
||||||
|
#56-2TI 2 Timothy
|
||||||
|
57-TIT Titus
|
||||||
|
#58-PHM Philemon
|
||||||
|
#59-HEB Hebrews
|
||||||
|
#60-JAS James
|
||||||
|
#61-1PE 1 Peter
|
||||||
|
#62-2PE 2 Peter
|
||||||
|
#63-1JN 1 John
|
||||||
|
#64-2JN 2 John
|
||||||
|
#65-3JN 3 John
|
||||||
|
#66-JUD Jude
|
||||||
|
#67-REV Revelation
|
|
@ -0,0 +1,410 @@
|
||||||
|
# Takes current tW entries and populates tagged OGNT XML
|
||||||
|
#
|
||||||
|
# This is the current best version
|
||||||
|
# Requires ULB that includes USFMs.
|
||||||
|
|
||||||
|
|
||||||
|
use 5.12.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
use utf8;
|
||||||
|
#use open IN => ":utf8", OUT => ":utf8";
|
||||||
|
use open IO => ":utf8";
|
||||||
|
$" = "\n";
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Main program.
#   1. Read_tW_Files    - Strong's number -> English forms (Logs/tW_pairs.txt)
#   2. LongBookNames    - short book code -> full name, from __DATA__
#   3. Prepare_ULB_file - per-verse text and preText from ULB.xml
#   4. ProcessXML       - writes one tagged file per OGNT file into $outDir
# ---------------------------------------------------------------------------
mkdir "Logs";
open(LOG, '>:utf8', "Logs/tW_pairs.txt") or die "Logs/tW_pairs.txt: $!";
my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml";
my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT";
#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test";
my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible";
my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", "");
my ($ULBText, $workText, $language);
my ($file);
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);

my @OGNTfilesToRun = ();
#my $filePattern = '\.xml' ;
my $filePattern = '52-COL\.xml' ;
find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
# find() returns names in directory order; sort so that runs are repeatable.
@OGNTfilesToRun = sort @OGNTfilesToRun;

say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

my @tWfilesToRun = ();
# The dot must be escaped: '.md' also accepted names ending in "<any char>md".
$filePattern = '\.md' ;
find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;

Read_tW_Files();

# From here on the trace goes to a second log file.
close LOG;
open(LOG, '>:utf8', "Logs/Log.txt") or die "Logs/Log.txt: $!";

LongBookNames();
Prepare_ULB_file();
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

ProcessXML();
# put unused SN at end of verse

close LOG;

say "\nDone.";
|
||||||
|
# =====
|
||||||
|
# Read_tW_Files()
#
# Pass 1: for every file in @tWfilesToRun, take the comma separated list after
# "* Strong's: " and the line that follows "Forms Found in the English ULB:",
# and append those forms to %entriesThisSN under each Strong's number.
# Dies (printing the file text) when the forms line is present but empty.
#
# Pass 2: for every Strong's number, re-order its forms with the comparator
# below and store them back as one ", "-separated string, which is also
# written to LOG.
sub Read_tW_Files {
    foreach my $tWfile ( @tWfilesToRun ) {
        say LOG $tWfile;
        my $fileText = read_file("$tWfile", binmode => 'utf8');

        my @sns;
        if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
            #say LOG "\t$1";
            @sns = split /, /, $1;
        }

        my $entries;
        if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
            $entries = $1;
            die "$fileText" if $entries eq "";
            #say LOG "\t\t$entries"
        }

        $entriesThisSN{$_} .= $entries . ", " for @sns;
    }

    foreach my $sn (sort keys %entriesThisSN) {
        #say LOG "$sn: $entriesThisSN{$sn}";
        my @forms = split /, /, $entriesThisSN{$sn};

        # NOTE(review): all three keys are compared numerically, so for
        # ordinary words only the length key can differ -- presumably the
        # point is to try longer forms first; confirm before changing.
        my @ranked = reverse sort { substr($a,0,1) <=> substr($b,0,1)
                                    || length($a) <=> length($b)
                                    || $a <=> $b }
                     @forms;

        $entriesThisSN{$sn} = join ", ", @ranked;
        say LOG "$sn: $entriesThisSN{$sn}";
    }

}
|
||||||
|
# LongBookNames()
#
# Fills %longName from the __DATA__ table: for each row with three TAB
# separated fields, the second field (short code) maps to the third field
# (full book name).  Rows of any other shape are ignored.
sub LongBookNames {
    while (my $row = <DATA>) {
        chomp $row;
        next unless $row =~ /([^\t]*)\t([^\t]*)\t([^\t]*)/;
        $longName{$2} = $3;
    }
}
|
||||||
|
# ProcessXML()
#
# For every file in @OGNTfilesToRun a file of the same name is written to
# $outDir.  The input is read line by line:
#   <verse osisID="bk.ch.vs">  looks up the ULB text and preText of the verse
#                              and starts a <verse name="..."> in the output;
#   <w ...>                    the word is appended to the verse's Greek text
#                              and moved into a text="..." attribute; lines
#                              whose Strong's number has tW forms are kept for
#                              matching, the others are marked with "√";
#   </verse>                   the collected lines are matched against the ULB
#                              text by ProcessRelevantSNs(), ordered by their
#                              position in it and written out together with
#                              the Greek, preText, ULB and residue lines;
#   anything else              is copied to the output unchanged.
# The first two characters of the file name decide the Strong's prefix:
# above 40 means "G", otherwise "H".
#
# NOTE(review): a <w> line whose lemma attribute is not purely digits is
# written to none of the lists and so disappears from the output -- confirm
# that this is intended.
sub ProcessXML {
    foreach my $file (@OGNTfilesToRun) {
        my ($greekText, $fileGist);
        if ($file =~ /((..)....\.xml)/) {
            ($fileGist, $language) = ($1, $2);
            $language = ($language > 40) ? "G" : "H";
        }
        say LOG "<0>\t$file \t$fileGist";
        open(OUT, '>:utf8', "$outDir/$fileGist") or die "$outDir/$fileGist: $!";

        # Per-file state; most of it is reset at the end of every verse.
        my ($bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines,
            $linesWithRelevantSNs, $linesNotMatched, $linesToSkip, $thisPreText);
        open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";

        while (my $thisLine = <$thisFile>) {
            chomp $thisLine;
            if ($thisLine =~ /<verse/) {say LOG "\n=========================="}

            if ($thisLine =~ /<\/verse>/) {
                say LOG "<0.1>\t$thisLine";
                say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";

                ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
                say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                # Each matched line ends in "◊<offset>"; order the lines by
                # that offset and drop the suffix.
                my %orderedLine;
                $matchedLines =~ s/\n{2,}/\n/gs;
                say LOG "<15>\tBefore sort of \$matchedLines:\n$matchedLines\n";
                while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
                    $orderedLine{$2} = $1;
                    say LOG "<5>\t\$2: $2\t\$1: $1";
                }
                $matchedLines = "";
                foreach my $line (sort {$a <=> $b} keys %orderedLine) {
                    say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
                    $matchedLines .= "$orderedLine{$line}\n";
                }
                chomp $matchedLines;
                say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                # Tidy the residue: drop the USFM spans and the "q" sentinels
                # that were put around the text when the verse was opened.
                $residueText =~ s/<usfm>.*?<\/usfm>//g;
                $residueText =~ s/(^q | q$)//g;
                $residueText =~ s/ {3,}/ /g;

                # Trim leading and trailing blanks.  The replacements used to
                # be "$1", which is empty for a pattern without groups.
                for ($residueText, $greekText, $staticText) {
                    s/^ +//;
                    s/ +$//;
                }

                # List the <usfm> spans of the ULB text on lines of their own.
                my $internalUSFM;
                while ($staticText =~ /(<usfm>.*?<\/usfm>)/g) {
                    $internalUSFM .= "\t\t\t\t\t$1\n";
                }

                $linesNotMatched =~ s/\n+$//;
                $linesToSkip =~ s/\n+$//;
                $matchedLines =~ s/^\n+//;
                $internalUSFM =~ s/\n+$//;
                say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";

                say OUT "\t\t\t\t\t<Greek>$greekText</Greek>";
                say OUT "\t\t\t\t\t<preText>$thisPreText</preText>";
                say OUT "\t\t\t\t\t<ULB>$staticText</ULB>";
                say OUT "\t\t\t\t\t<residue>$residueText</residue>";
                say OUT "$matchedLines" unless ($matchedLines eq "");
                say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
                say OUT "$internalUSFM" unless ($internalUSFM eq "");
                say OUT "$linesToSkip" if ($linesToSkip);
                say OUT "$thisLine";

                # Reset the per-verse state.
                ($workText, $greekText, $linesNotMatched, $residueText, $linesWithRelevantSNs) = ();
                $linesToSkip = "";
            }
            elsif ($thisLine =~ /<w /) {
                say LOG "<0.2>\t$thisLine";
                if ($thisLine =~ />([^\n<>]*)</) {
                    $greekText .= $1 . " ";
                }
                # Move the word into a text="..." attribute; the element
                # content is filled in later with the matched English form.
                $thisLine =~ s/(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
                if ($thisLine =~ /lemma="(\d+)"/) {
                    my $thisLemma = $language . $1;
                    if (exists $entriesThisSN{$thisLemma}) {
                        $linesWithRelevantSNs .= $thisLine . "\n";
                        say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
                    }
                    else {
                        $thisLine =~ s/><\/w>/>√<\/w>/;
                        $linesToSkip .= "$thisLine\n";
                        #say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
                    }
                }
            }
            elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
                say LOG "<0.3>\t$thisLine";
                ($bk, $ch, $vs) = ($1,$2,$3);
                ($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
                $staticText = $ULBtextThisVerse{$thisVerse};
                $residueText = "q $staticText q";
                $thisPreText = $ULBpreTextThisVerse{$thisVerse};
                say OUT "\t\t\t\t<verse name=\"$thisVerse\">";
            }
            else {say OUT $thisLine}
        }

        close $thisFile;
        # Buffered write errors only surface at close, so check it.
        close OUT or die "$outDir/$fileGist: $!";
    }
}
|
||||||
|
# ProcessRelevantSNs($relevantLines, $staticText, $residueText, $linesNotMatched)
#
# $relevantLines holds newline separated <w ...> lines.  Each line with a
# numeric lemma is passed to MatchAndPlace() together with its Strong's
# number ($language followed by the lemma).  Returns
# ($matchedLines, $residueText, $linesNotMatched): the accumulated results of
# MatchAndPlace (one per line), the residue after all matches were blanked,
# and the lines for which nothing matched.
sub ProcessRelevantSNs {
    my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
    my $matchedLines;

    foreach my $line (split /\n/, $relevantLines) {
        next unless $line =~ /lemma="(\d+)"/;
        my $thisSN = $language . $1;
        say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";

        (my $placed, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
        $placed =~ s/[ \t]+$//;

        $matchedLines .= $placed . "\n";
        # An unmatched line contributes only a newline; do not let them pile up.
        $matchedLines =~ s/\n{2,}$/\n/s;
        say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++";
    }

    return ($matchedLines, $residueText, $linesNotMatched);
}
|
||||||
|
# MatchAndPlace($line, $sn, $staticText, $workText, $linesNotMatched)
#
# Tries the English forms recorded for Strong's number $sn, in their stored
# order, against $workText.  A form may be a plain phrase or up to three parts
# separated by " ... ", in which case the parts must occur in that order with
# anything in between.
#
# On the first hit the form is written into the <w> element of $line, the
# matched words in $workText are overwritten with blanks (so that offsets stay
# valid and the words cannot be matched again) and "$line◊<offset>" is
# returned as the first value, <offset> being the position of the match.
# Without a hit the element content becomes "?" and the line is appended to
# $linesNotMatched; the first return value is then undef.
#
# Returns ($matchedLine, $workText, $linesNotMatched).
sub MatchAndPlace {
    my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
    #say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText";
    my ($found, $matchedLines);
    my @entries = split /, /, $entriesThisSN{$sn};

    foreach my $entry (@entries) {
        # Declared per entry: when these were shared by all entries, the third
        # part of an earlier, unmatched three-part form leaked into a later
        # two-part form and made FixWorkText take the wrong branch.
        my ($workEntry, $entryType, $first, $second, $third, $firstLen, $secondLen, $thirdLen);

        # \Q...\E: the forms are plain text, not regular expressions.
        if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
            ($first, $second, $third) = ($1, $2, $3);
            ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
            $workEntry = "\\b\Q$first\E\\b.*?\\b\Q$second\E\\b.*?\\b\Q$third\E";
            say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 1;
        }
        elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
            ($first, $second) = ($1, $2);
            ($firstLen, $secondLen) = (length $first, length $second);
            $workEntry = "\\b\Q$first\E\\b.*?\\b\Q$second\E\\b";
            say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|";
            $entryType = 2;
        }
        else {
            $workEntry = quotemeta $entry;
        }

        #say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
        next unless $workText =~ /\b$workEntry\b/p;

        # Read the match variables before the next pattern match replaces them.
        my $foundText       = ${^MATCH};
        my $place           = length ${^PREMATCH};
        my $foundTextLength = length $foundText;
        $workText = "${^PREMATCH}ı${^POSTMATCH}";    # "ı" marks where the match was
        say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";

        $line =~ s/></>$entry</;
        my $replacementSpaces = " " x $foundTextLength;

        if ($entryType) {
            say LOG "<8.2>\n\$workText,: $workText, \$matchedLines:\n$matchedLines ";
            # Multi-part form: only the parts are blanked, the words between
            # them stay available for other Strong's numbers.
            ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);
            say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
        }
        # Plain form -- or a marker FixWorkText could not resolve: blank the
        # whole match so that the marker never ends up in the residue.
        $workText =~ s/ı/$replacementSpaces/;

        $matchedLines .= "$line◊$place";
        say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
        $found = 1;
        last;
    }

    unless ($found) {
        $line =~ s/></>?</;
        $linesNotMatched .= "$line\n";
    }
    return ($matchedLines, $workText, $linesNotMatched);
}
|
||||||
|
# FixWorkText($thisLine, $text, $entry, $foundText, $foundTextLength,
#             $first, $firstLen, $second, $secondLen, $third, $thirdLen)
#
# $text contains one "ı" (U+0131) marker where $foundText was cut out.
# $foundText is a match of a multi-part form: $first ... $second
# [... $third].  The marker is replaced by $foundText with each part
# overwritten by blanks of the given length, so the words between the parts
# survive and every offset in $text stays where it was.  If the parts cannot
# be located in $foundText, the marker is replaced by $foundTextLength blanks.
# $thisLine and $entry are used for logging only.
#
# Returns the repaired text as a one-element list.
sub FixWorkText {
    my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;

    # Written as an escape so that it means the same character whether or not
    # the source is read under "use utf8".
    my $marker = "\x{131}";

    my ($firstSpace, $secondSpace, $thirdSpace) =
        map { " " x ($_ // 0) } ($firstLen, $secondLen, $thirdLen);

    say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";

    # \Q...\E: the parts are plain text; characters such as "+" or "(" in a
    # form must not be read as regular expression syntax.
    my $repText;
    if ($third) {
        if ($foundText =~ /\Q$first\E(.*)\Q$second\E(.*)\Q$third\E/) {
            my ($firstGap, $secondGap) = ($1, $2);
            $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
            say LOG "<9.1> \$repText: $repText";
        }
    }
    elsif ($foundText =~ /\Q$first\E(.*)\Q$second\E/) {
        my ($firstGap) = ($1);
        say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
        $repText = "$firstSpace$firstGap$secondSpace";
        say LOG "<9.3> \$repText: |$repText|";
    }

    # Never hand the marker back to the caller: blank the whole match instead.
    $repText = " " x ($foundTextLength // 0) unless defined $repText;

    $text =~ s/$marker/$repText/;
    return ($text);
}
|
||||||
|
# Prepare_ULB_file()
#
# Reads $ULBfile line by line and fills two lookup tables keyed by verse name
# (the value of the most recent verse name="..." attribute):
#   %ULBpreTextThisVerse - content of the verse's <preText> element
#   %ULBtextThisVerse    - content of the verse's <text> element
# Dies if the file cannot be opened.
sub Prepare_ULB_file {
    my $currentVerse;

    open (my $ulbHandle, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";

    while (my $row = <$ulbHandle>) {
        chomp $row;

        # A line is treated as exactly one of: verse start, preText, text.
        if ($row =~ /verse name="(.*?)"/) {
            $currentVerse = $1;
            next;
        }
        if ($row =~ /<preText>(.*?)<\/preText>/) {
            $ULBpreTextThisVerse{$currentVerse} = $1;
            next;
        }
        if ($row =~ /<text>(.*?)<\/text>/) {
            $ULBtextThisVerse{$currentVerse} = $1;
        }
    }

    close $ulbHandle;
}
|
||||||
|
|
||||||
|
__DATA__
|
||||||
|
01 gen Genesis
|
||||||
|
02 exo Exodus
|
||||||
|
03 lev Leviticus
|
||||||
|
04 num Numbers
|
||||||
|
05 deu Deuteronomy
|
||||||
|
06 jos Joshua
|
||||||
|
07 jdg Judges
|
||||||
|
08 rut Ruth
|
||||||
|
09 1sa 1 Samuel
|
||||||
|
10 2sa 2 Samuel
|
||||||
|
11 1ki 1 Kings
|
||||||
|
12 2ki 2 Kings
|
||||||
|
13 1ch 1 Chronicles
|
||||||
|
14 2ch 2 Chronicles
|
||||||
|
15 ezr Ezra
|
||||||
|
16 neh Nehemiah
|
||||||
|
17 est Esther
|
||||||
|
18 job Job
|
||||||
|
19 psa Psalms
|
||||||
|
20 pro Proverbs
|
||||||
|
21 ecc Ecclesiastes
|
||||||
|
22 sng Song of Solomon
|
||||||
|
23 isa Isaiah
|
||||||
|
24 jer Jeremiah
|
||||||
|
25 lam Lamentations
|
||||||
|
26 ezk Ezekiel
|
||||||
|
27 dan Daniel
|
||||||
|
28 hos Hosea
|
||||||
|
29 jol Joel
|
||||||
|
30 amo Amos
|
||||||
|
31 oba Obadiah
|
||||||
|
32 jon Jonah
|
||||||
|
33 mic Micah
|
||||||
|
34 nam Nahum
|
||||||
|
35 hab Habakkuk
|
||||||
|
36 zep Zephaniah
|
||||||
|
37 hag Haggai
|
||||||
|
38 zec Zechariah
|
||||||
|
39 mal Malachi
|
||||||
|
41 mat Matthew
|
||||||
|
42 mrk Mark
|
||||||
|
43 luk Luke
|
||||||
|
44 jhn John
|
||||||
|
45 act Acts
|
||||||
|
46 rom Romans
|
||||||
|
47 1co 1 Corinthians
|
||||||
|
48 2co 2 Corinthians
|
||||||
|
49 gal Galatians
|
||||||
|
50 eph Ephesians
|
||||||
|
51 php Philippians
|
||||||
|
52 col Colossians
|
||||||
|
53 1th 1 Thessalonians
|
||||||
|
54 2th 2 Thessalonians
|
||||||
|
55 1ti 1 Timothy
|
||||||
|
56 2ti 2 Timothy
|
||||||
|
57 tit Titus
|
||||||
|
58 phm Philemon
|
||||||
|
59 heb Hebrews
|
||||||
|
60 jas James
|
||||||
|
61 1pe 1 Peter
|
||||||
|
62 2pe 2 Peter
|
||||||
|
63 1jn 1 John
|
||||||
|
64 2jn 2 John
|
||||||
|
65 3jn 3 John
|
||||||
|
66 jud Jude
|
||||||
|
67 rev Revelation
|
|
@ -0,0 +1,59 @@
|
||||||
|
use 5.12.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
|
||||||
|
# Indexes capitalized words found in the ULB text file.
#
# Each input line is expected to look like "<location>\t<text>". Every
# capitalized word (optionally hyphenated, e.g. "Abc-Def") in the text part is
# recorded together with the location of the first line it was seen on. Words
# whose lowercase form also occurs anywhere in the file are then dropped, and
# the remaining "word, location" pairs are written to out/results.txt.

my $ulb_path     = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt";
my $results_path = "out/results.txt";
my $log_path     = "log/log.log";

my %location;

# Writes one "word, location" line per key of the given hash (sorted by key)
# to $path, replacing any previous content of that file.
sub write_results {
    my ($path, $location_of) = @_;

    open my $out, '>', $path or die "Cannot write $path: $!";
    foreach my $word (sort keys %$location_of) {
        say {$out} "$word, $location_of->{$word}";
    }
    # Buffered write errors only surface at close time.
    close $out or die "Cannot close $path: $!";
    return;
}

# The log is opened (and thereby truncated) as before; the statements that
# wrote to it are debugging aids and remain disabled.
open my $log, '>', $log_path or die "Cannot write $log_path: $!";

open my $in, '<', $ulb_path or die "Cannot read $ulb_path: $!";

say "Reading ULB";

while (my $line = <$in>) {
    #print {$log} $line;
    chomp $line;

    # Strip one capitalized word per iteration from the text after the first
    # tab; $1 is the location column and $3 the word that was just removed.
    while ($line =~ s/^([^\n\t]*)\t([^\n]*?)([A-Z][a-z]+(-[A-Z][a-z]+)?)/$1\t$2/) {
        #say {$log} $3;
        $location{$3} = $1 unless exists $location{$3};
    }
}

close $in;

say "Outputting hash";

# Intermediate snapshot of the unfiltered index; overwritten further down.
write_results($results_path, \%location);

say "Deleting common words";

my $fileText = read_file($ulb_path, binmode => 'utf8');

# Iterating over a pre-computed key list makes deleting inside the loop safe.
foreach my $word (sort keys %location) {
    my $temp = lc $word;
    #say {$log} $word . "\t" . $temp;
    if ($fileText =~ /\b$temp\b/) {
        delete $location{$word};
    }
}

say "Outputting final product";

write_results($results_path, \%location);

close $log or die "Cannot close $log_path: $!";
|
|
@ -0,0 +1,365 @@
|
||||||
|
use 5.12.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
|
||||||
|
# Looks up each name pattern listed after __DATA__ in the ULB text file and
# logs "pattern, location" for the first line that contains it.
#
# Each line of the text file is expected to look like "<location>\t<text>".
# The __DATA__ entries are interpolated into the regex as patterns (some of
# them contain ".*" on purpose), so they are deliberately not quoted.

my $ulb_path = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt";
my $log_path = "log/log.log";

my %value;

open my $log, '>', $log_path or die "Cannot write $log_path: $!";

while (my $entry = <DATA>) {
    chomp $entry;
    # A blank entry would become an empty pattern that matches the first
    # tabbed line and produce a bogus ", <location>" log record.
    next unless $entry =~ /\S/;
    $value{$entry} = $entry;    # hash used as a set: duplicates collapse
}

my $fileText = read_file($ulb_path, binmode => 'utf8');

# Prepend a newline so the "\n<location>\t" anchor also works for line one.
$fileText = "\n" . $fileText;

foreach my $thisWord (sort keys %value) {
    print $thisWord . "\n";
    # $1 is the location column of the first line whose text contains the
    # pattern followed by a word boundary.
    if ($fileText =~ /\n([^\n\t]*)\t[^\n]*$thisWord\b/) {
        say {$log} $thisWord . ", " . $1;
    }
}

close $log or die "Cannot close $log_path: $!";
|
||||||
|
|
||||||
|
__DATA__
|
||||||
|
Aaron
|
||||||
|
Abel
|
||||||
|
Abiathar
|
||||||
|
Abijah
|
||||||
|
Abimelek
|
||||||
|
Abner
|
||||||
|
Abraham
|
||||||
|
Absalom
|
||||||
|
Adam
|
||||||
|
Adonijah
|
||||||
|
Ahab
|
||||||
|
Ahaz
|
||||||
|
Ahaziah
|
||||||
|
Ahijah
|
||||||
|
Ai
|
||||||
|
Amalek
|
||||||
|
Amaziah
|
||||||
|
Ammon
|
||||||
|
Amnon
|
||||||
|
Amorite
|
||||||
|
Amos
|
||||||
|
Amoz
|
||||||
|
Andrew
|
||||||
|
Annas
|
||||||
|
Antioch
|
||||||
|
Apollos
|
||||||
|
Aquila
|
||||||
|
Arabah
|
||||||
|
Arabia
|
||||||
|
Aram
|
||||||
|
Ararat
|
||||||
|
Artaxerxes
|
||||||
|
Asa
|
||||||
|
Asaph
|
||||||
|
Ashdod
|
||||||
|
Asher
|
||||||
|
Asherah
|
||||||
|
Ashkelon
|
||||||
|
Asia
|
||||||
|
Assyria
|
||||||
|
Athaliah
|
||||||
|
Azariah
|
||||||
|
Baal
|
||||||
|
Baasha
|
||||||
|
Babel
|
||||||
|
Babylon
|
||||||
|
Balaam
|
||||||
|
Barabbas
|
||||||
|
Barnabas
|
||||||
|
Bartholomew
|
||||||
|
Baruch
|
||||||
|
Bashan
|
||||||
|
Bathsheba
|
||||||
|
Beelzebul
|
||||||
|
Beersheba
|
||||||
|
Benaiah
|
||||||
|
Benjamin
|
||||||
|
Berea
|
||||||
|
Beth Shemesh
|
||||||
|
Bethany
|
||||||
|
Bethel
|
||||||
|
Bethlehem
|
||||||
|
Bethuel
|
||||||
|
Boaz
|
||||||
|
Caesar
|
||||||
|
Caesarea
|
||||||
|
Caiaphas
|
||||||
|
Cain
|
||||||
|
Caleb
|
||||||
|
Cana
|
||||||
|
Canaan
|
||||||
|
Capernaum
|
||||||
|
Carmel
|
||||||
|
Chaldea
|
||||||
|
Cilicia
|
||||||
|
Colossae
|
||||||
|
Corinth
|
||||||
|
Cornelius
|
||||||
|
Crete
|
||||||
|
Cush
|
||||||
|
Cyprus
|
||||||
|
Cyrene
|
||||||
|
Cyrus
|
||||||
|
Damascus
|
||||||
|
Dan
|
||||||
|
Daniel
|
||||||
|
Darius
|
||||||
|
David
|
||||||
|
Delilah
|
||||||
|
Eden
|
||||||
|
Edom
|
||||||
|
Egypt
|
||||||
|
Ekron
|
||||||
|
Elam
|
||||||
|
Eleazar
|
||||||
|
Eliakim
|
||||||
|
Elijah
|
||||||
|
Elisha
|
||||||
|
Elizabeth
|
||||||
|
En Gedi
|
||||||
|
Enoch
|
||||||
|
Ephesus
|
||||||
|
Ephraim
|
||||||
|
Ephrath
|
||||||
|
Esau
|
||||||
|
Esther
|
||||||
|
Ethiopia
|
||||||
|
Euphrates River
|
||||||
|
Eve
|
||||||
|
Ezekiel
|
||||||
|
Ezra
|
||||||
|
Gabriel
|
||||||
|
Gad
|
||||||
|
Galatia
|
||||||
|
Galilee
|
||||||
|
Gath
|
||||||
|
Gaza
|
||||||
|
Gerar
|
||||||
|
Geshur
|
||||||
|
Gethsemane
|
||||||
|
Gibeah
|
||||||
|
Gibeon
|
||||||
|
Gideon
|
||||||
|
Gilead
|
||||||
|
Gilgal
|
||||||
|
Girgashites
|
||||||
|
Golgotha
|
||||||
|
Goliath
|
||||||
|
Gomorrah
|
||||||
|
Goshen
|
||||||
|
Greece
|
||||||
|
Greek
|
||||||
|
Habakkuk
|
||||||
|
Hagar
|
||||||
|
Haggai
|
||||||
|
Ham
|
||||||
|
Hamath
|
||||||
|
Hamor
|
||||||
|
Hananiah
|
||||||
|
Hannah
|
||||||
|
Haran
|
||||||
|
Hebron
|
||||||
|
Hermon
|
||||||
|
Herod
|
||||||
|
Herodias
|
||||||
|
Hezekiah
|
||||||
|
Hilkiah
|
||||||
|
Hittite
|
||||||
|
Hivite
|
||||||
|
Horeb
|
||||||
|
Hosea
|
||||||
|
Hoshea
|
||||||
|
Iconium
|
||||||
|
Isaac
|
||||||
|
Isaiah
|
||||||
|
Ishmael
|
||||||
|
Issachar
|
||||||
|
Jacob
|
||||||
|
James
|
||||||
|
Japheth
|
||||||
|
Jebus
|
||||||
|
Jehoiachin
|
||||||
|
Jehoiada
|
||||||
|
Jehoiakim
|
||||||
|
Jehoram
|
||||||
|
Jehoshaphat
|
||||||
|
Jehu
|
||||||
|
Jephthah
|
||||||
|
Jeremiah
|
||||||
|
Jericho
|
||||||
|
Jeroboam
|
||||||
|
Jerusalem
|
||||||
|
Jesse
|
||||||
|
Jethro
|
||||||
|
Jezebel
|
||||||
|
Jezreel
|
||||||
|
Joab
|
||||||
|
Joash
|
||||||
|
Job
|
||||||
|
Joel
|
||||||
|
John
|
||||||
|
John Mark
|
||||||
|
Jonah
|
||||||
|
Jonathan
|
||||||
|
Joppa
|
||||||
|
Joram
|
||||||
|
Jordan River
|
||||||
|
Joseph
|
||||||
|
Joshua
|
||||||
|
Josiah
|
||||||
|
Jotham
|
||||||
|
Judah
|
||||||
|
Judas.*Iscariot
|
||||||
|
Judas son of James
|
||||||
|
Judea
|
||||||
|
Kadesh
|
||||||
|
Kedar
|
||||||
|
Kedesh
|
||||||
|
Kerethites
|
||||||
|
Kidron Valley
|
||||||
|
Korah
|
||||||
|
Laban
|
||||||
|
Lamech
|
||||||
|
Lazarus
|
||||||
|
Leah
|
||||||
|
Lebanon
|
||||||
|
Levi
|
||||||
|
Leviathan
|
||||||
|
Lot
|
||||||
|
Luke
|
||||||
|
Lystra
|
||||||
|
Maakah
|
||||||
|
Macedonia
|
||||||
|
Maker
|
||||||
|
Malachi
|
||||||
|
Manasseh
|
||||||
|
Martha
|
||||||
|
Mary
|
||||||
|
Mary
|
||||||
|
Mary.*Magdalene
|
||||||
|
Matthew
|
||||||
|
Mede
|
||||||
|
Melchizedek
|
||||||
|
Memphis
|
||||||
|
Meshech
|
||||||
|
Mesopotamia
|
||||||
|
Micah
|
||||||
|
Michael
|
||||||
|
Midian
|
||||||
|
Miriam
|
||||||
|
Mishael
|
||||||
|
Mizpah
|
||||||
|
Moab
|
||||||
|
Molech
|
||||||
|
Mordecai
|
||||||
|
Moses
|
||||||
|
Mount of Olives
|
||||||
|
Naaman
|
||||||
|
Nahor
|
||||||
|
Nahum
|
||||||
|
Naphtali
|
||||||
|
Nathan
|
||||||
|
Nazareth
|
||||||
|
Nebuchadnezzar
|
||||||
|
Negev
|
||||||
|
Nehemiah
|
||||||
|
Nile River
|
||||||
|
Nineveh
|
||||||
|
Noah
|
||||||
|
Obadiah
|
||||||
|
Omri
|
||||||
|
Paddan Aram
|
||||||
|
Paran
|
||||||
|
Paul
|
||||||
|
Peor
|
||||||
|
Perizzite
|
||||||
|
Persia
|
||||||
|
Peter
|
||||||
|
Pharaoh
|
||||||
|
Philip
|
||||||
|
Philippi
|
||||||
|
Philistia
|
||||||
|
Philistines
|
||||||
|
Phinehas
|
||||||
|
Phoenicia
|
||||||
|
Pilate
|
||||||
|
Pontus
|
||||||
|
Potiphar
|
||||||
|
Priscilla
|
||||||
|
Rabbah
|
||||||
|
Rachel
|
||||||
|
Rahab
|
||||||
|
Ramah
|
||||||
|
Ramoth
|
||||||
|
Rebekah
|
||||||
|
Rehoboam
|
||||||
|
Reuben
|
||||||
|
Rimmon
|
||||||
|
Rome
|
||||||
|
Ruth
|
||||||
|
Salt Sea
|
||||||
|
Samaria
|
||||||
|
Samson
|
||||||
|
Samuel
|
||||||
|
Sarah
|
||||||
|
Saul
|
||||||
|
Sea of Galilee
|
||||||
|
Sea of Reeds
|
||||||
|
Sennacherib
|
||||||
|
Seth
|
||||||
|
Sharon
|
||||||
|
Sheba
|
||||||
|
Shechem
|
||||||
|
Shem
|
||||||
|
Shiloh
|
||||||
|
Shimei
|
||||||
|
Shinar
|
||||||
|
Sidon
|
||||||
|
Silas
|
||||||
|
Simeon
|
||||||
|
Simon the Zealot
|
||||||
|
Sinai
|
||||||
|
Sodom
|
||||||
|
Solomon
|
||||||
|
Stephen
|
||||||
|
Sukkoth
|
||||||
|
Syria
|
||||||
|
Tamar
|
||||||
|
Tarshish
|
||||||
|
Tarsus
|
||||||
|
Terah
|
||||||
|
Thessalonica
|
||||||
|
Thomas
|
||||||
|
Timothy
|
||||||
|
Tirzah
|
||||||
|
Titus
|
||||||
|
Troas
|
||||||
|
Tubal
|
||||||
|
Tychicus
|
||||||
|
Tyre
|
||||||
|
Ur
|
||||||
|
Uriah
|
||||||
|
Uzziah
|
||||||
|
Vashti
|
||||||
|
Xerxes
|
||||||
|
Zacchaeus
|
||||||
|
Zadok
|
||||||
|
Zebedee
|
||||||
|
Zebulun
|
||||||
|
Zechariah
|
||||||
|
Zedekiah
|
||||||
|
Zephaniah
|
||||||
|
Zerubbabel
|
||||||
|
Zoar
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Probes pages under https://biblehub.com/greek/ whose name is a number
# followed by a letter suffix (e.g. 1611a.htm) and records every URL that
# exists in out/results.txt. For each number the suffixes are tried in order
# and probing stops at the first one the site reports as missing.

my $pre      = "https://biblehub.com/greek/";
my $post     = ".htm";
my @suffixes = split(/, /, "a, b, c, d, e, f, g, h");

my $results_path = "out/results.txt";

#$out = system "curl --fail https://biblehub.com/greek/2.htm";
#print "\n\n\t\$out: $out";

# Fail before any network traffic if the results file cannot be created;
# previously a failed open went unnoticed and all output was discarded.
open my $out, '>', $results_path or die "Cannot write $results_path: $!";

foreach my $xx (1611 .. 1613) {
    foreach my $suffix (@suffixes) {
        my $string = $pre . $xx . $suffix . $post;

        # The URL is assembled from constants only, so passing it through
        # the shell is safe here.
        my $page = `curl $string`;

        # The site answers with this message for a page that does not exist;
        # later suffixes for this number are then skipped.
        last if $page =~ /We're sorry, we were not able to find that passage./;

        # NOTE(review): records are written without a separator between
        # them, exactly as before — confirm whether a newline is wanted.
        print {$out} "\$xx: $xx. \$string: $string.";
    }
}

close $out or die "Cannot close $results_path: $!";
|
|
@ -0,0 +1,38 @@
|
||||||
|
use 5.18.0;
|
||||||
|
use File::Slurp;
|
||||||
|
use File::Find ;
|
||||||
|
use Cwd ;
|
||||||
|
|
||||||
|
# Collects every distinct backslash marker (a backslash followed by any run
# of non-space characters) used in the .usfm files below $topDir and writes
# the sorted list, one marker per line, to the output file.

my $topDir = "/Users/Henry/Documents/WACS/en_ulb";

my $log_path = "/Users/Henry/Google Drive/WA/Scripts/out/log.log";
my $out_path = "/Users/Henry/Google Drive/WA/Scripts/out/output.txt";

my %found;
my @filesToRun = ();

# A real regex for the extension. The former glob-style '*.usfm' was
# interpolated into a pattern, where the unescaped dot matched any character
# (so "xusfm" counted as a match) and "(.*)*" was a nested quantifier.
my $filePattern = qr/\.usfm\z/;

# The log is opened (and thereby truncated) as before; nothing writes to it.
open my $log, '>', $log_path or die "Cannot write $log_path: $!";
open my $out, '>', $out_path or die "Cannot write $out_path: $!";

# Inside the callback $_ is the bare file name and $File::Find::name is the
# path including the directory.
find(sub { push @filesToRun, $File::Find::name if $_ =~ $filePattern }, $topDir);

foreach my $file (@filesToRun) {
    print "$file\n";
    my $fileText = read_file("$file", binmode => 'utf8');

    # Join the lines so a marker at the end of a line is also followed by
    # the space the pattern below requires.
    $fileText =~ s/\n/ /g;

    while ($fileText =~ /(\\[^ ]*) /g) {
        my $code = $1;
        $found{$code} //= $code;    # hash used as a set of markers
    }
}

foreach my $code (sort keys %found) {
    say {$out} $code;
}

# Buffered write errors only surface at close time.
close $out or die "Cannot close $out_path: $!";
close $log or die "Cannot close $log_path: $!";

print "\n\tDone."
|
Loading…
Reference in New Issue