Now needed for tagging

This commit is contained in:
Henry Whitney 2020-07-28 17:34:00 -04:00
parent 13b8c8c948
commit 91d789da65
8 changed files with 1315 additions and 0 deletions

180
Build_OL_files_from_XML.pl Normal file
View File

@ -0,0 +1,180 @@
# Builds easily searchable file from current OGNT and MAST-HB XML file
# Takes verse at a time from slurped file
# Useful for Mine routine building MAST PDF
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
open LOG, ">:utf8", "Logs/log.txt" or die;
open OUT, ">:utf8", "Output/Original_languages.txt" or die;
my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT");
my (%order, %long);
my $outText;
while (<DATA>) {
chomp;
if (/^([^\t]*)\t([^\t]*)\t(.*)$/) {
$order{$1} = $3;
$long{$2} = $3;
}
}
#foreach my $key (sort keys %long) {
# say LOG $key . "\t" . $long{$key};
#}
foreach my $folder (@folders) {
say LOG "$folder";
#system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";
my ($topDir, $lang) = ($folder, "H");
if ($folder =~ /OGNT/) {$lang = "G"}
my @filesToRun = ();
my $filePattern = '*.xml' ;
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
@filesToRun = sort @filesToRun;
foreach my $file ( @filesToRun ) {
say LOG $file;
my $fileText = read_file("$file", binmode => 'utf8');
my ($bk, $ch, $vs, $lemma, $word, $nbk, $nch, $nvs, $previous, $current, $interruption, $next, $verse, $thisBookText, $prevVsText, $holdText, $thisVsText, $nextVsText, $oldHold, $shortCur, $shortIntr);
while ($fileText =~ /<verse osisID="((.*?)\.(\d+)\.(\d+))"(\n|.)*?<\/verse>/spg) {
$verse = $&;
say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4";
($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4);
$previous = $current;
$current = "$bk $ch:$vs";
say LOG "<0>\t\$current: $current";
my $verseText;
if ($verse =~ /<note>KJV:(([^\.]*)\.([^\.]*).([^<]*))<\/note>/p) { # Occurs only in OT
say LOG "<1>\t$&";
($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4);
$interruption = "$nbk $nch:$nvs";
say LOG "<2>\t\$interruption: $interruption (of $current)";
if ($verse =~ /<verse osisID="$shortCur">\n[^<\n]*<note>KJV:$shortIntr<\/note>/) { # Complete renumber of verse
say LOG "<3>\t$&";
$current = $interruption;
$verseText = GetContent($verse);
$verseText = "$current\t$oldHold$verseText";
$oldHold = "";
}
elsif ($interruption ne $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # New verse begins here
say LOG "<4>\t$&";
($thisVsText, $nextVsText) = (${^PREMATCH}, ${^POSTMATCH});
$thisVsText = GetContent($thisVsText);
$nextVsText = GetContent($nextVsText);
$outText .= "$oldHold\n$current\t$thisVsText ";
$oldHold = "$nextVsText ";
}
elsif ($interruption eq $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # Previous verse continues here
say LOG "<5>\t$&";
($prevVsText, $thisVsText) = (${^PREMATCH}, ${^POSTMATCH});
$prevVsText = GetContent($prevVsText);
$thisVsText = GetContent($thisVsText);
$verseText .= "$oldHold\n$current\t$thisVsText";
$oldHold = "";
}
}
else {
# The whole verse should be processed in one piece
#$verseText = GetContent($verse);
#$verseText = "$current\$tverseText"
}
#$thisBookText .= "\n$verseText";
#$oldHold = $holdText
}
#$thisBookText =~ s/</<$lang/g;
#$outText .= "$thisBookText\n";
}
}
say OUT $outText;
close OUT;
close LOG;
print "\n\tDone.";
sub GetContent {
my ($text, $returnText) = ($_[0], "");
while ($text =~ /<w lemma="([^"]*)"[^>]*>([^<]*)<\/w>/) {
my ($lemma, $OL) = ($1, $2);
$lemma =~ s/[^\d"]*(\d+)[^\d"]*/$1/;
$returnText .= "$OL <$lemma> "
}
return $returnText
}
__DATA__
01 gen Genesis
02 exo Exodus
03 lev Leviticus
04 num Numbers
05 deu Deuteronomy
06 jos Joshua
07 jdg Judges
08 rut Ruth
09 1sa 1 Samuel
10 2sa 2 Samuel
11 1ki 1 Kings
12 2ki 2 Kings
13 1ch 1 Chronicles
14 2ch 2 Chronicles
15 ezr Ezra
16 neh Nehemiah
17 est Esther
18 job Job
19 psa Psalms
20 pro Proverbs
21 ecc Ecclesiastes
22 sng Song of Solomon
23 isa Isaiah
24 jer Jeremiah
25 lam Lamentations
26 ezk Ezekiel
27 dan Daniel
28 hos Hosea
29 jol Joel
30 amo Amos
31 oba Obadiah
32 jon Jonah
33 mic Micah
34 nam Nahum
35 hab Habakkuk
36 zep Zephaniah
37 hag Haggai
38 zec Zechariah
39 mal Malachi
41 mat Matthew
42 mrk Mark
43 luk Luke
44 jhn John
45 act Acts
46 rom Romans
47 1co 1 Corinthians
48 2co 2 Corinthians
49 gal Galatians
50 eph Ephesians
51 php Philippians
52 col Colossians
53 1th 1 Thessalonians
54 2th 2 Thessalonians
55 1ti 1 Timothy
56 2ti 2 Timothy
57 tit Titus
58 phm Philemon
59 heb Hebrews
60 jas James
61 1pe 1 Peter
62 2pe 2 Peter
63 1jn 1 John
64 2jn 2 John
65 3jn 3 John
66 jud Jude
67 rev Revelation

View File

@ -0,0 +1,126 @@
# Creates workable ULB.xml file that has all USFM markers in place.
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
open(LOG, ">Logs/Log.txt") or die "$!";
open(OUT, ">/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml") or die "$!";
say OUT "<xml>";
my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml");
my @filesToRun = ();
my $filePattern = '\.usfm' ;
#my $filePattern = '67-REV\.usfm' ;
my $file;
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
@filesToRun = sort @filesToRun;
ReadFiles();
say OUT "</xml>";
close OUT;
close LOG;
say "\nDone.";
# =====
sub ReadFiles {
foreach $file ( @filesToRun ) {
say $file;
my @array;
my $fileText = read_file("$file", binmode => 'utf8');
$fileText =~ s/[ \n]+$//;
say LOG "|$fileText|";
#Delete \n
my ($book, $chap, $vers, $chapStart);
if ($fileText =~ /\\h ([^\n]*)/) {
$book = $1
}
#say LOG $book;
$fileText =~ s/\n/ /g;
$fileText =~ s/ / /g;
$fileText =~ s/\\s5/\n$&/g;
$fileText =~ s/\\v/√/g;
while ($fileText =~ s/(√[^√\n]*)(√)/$1\n$2/) {}
$fileText =~ s/√/\\v/g;
$fileText =~ s/(\\id[^\n]*)\n/\t\t<heading>$1<\/heading>\n/;
$fileText =~ s/ +\n/\n/g;
$fileText =~ s/(\\(q\d?|pi?|m|n?b))\n/\n$1 /g;
#say LOG $fileText;
@array = split /\n/, $fileText;
$fileText = "";
foreach my $line (@array) {
chomp;
if ($line =~ /<book name="(.*?)">/) {$book = $1;}
if ($line =~ /\\c (\d+).* \\v (\d+)/) {
($chap, $vers) = ($1, $2);
$line = "\t\t<chapter name=\"$book $chap\">\n\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>";
$line = "\t\t</chapter>\n$line" if $chapStart;
$chapStart = 1;
}
elsif ($line =~ /\\v (\d+)/) {
$vers = $1;
$line = "\t\t\t<verse name=\"$book $chap:$vers\">$line</verse>"
}
#say LOG "===\n<AA>\n$line";
$line =~ s/(<verse[^>]*>)(.*\\v \d+ )(.*)(<\/verse>)/$1\n\t\t\t\t<preText>$2<\/preText>\n\t\t\t\t<text>$3<\/text>\n\t\t\t$4/s;
#say LOG "===\n<BB>\n$line";
if ($line =~ /<text>.*<\/text>/p) {
say LOG "<-0>\t$line";
my ($pre, $match, $post) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
#say LOG "<-1>\t\$pre: $pre,\n\$match: $match,\n\$post: $post";
$match = TagInternalUSFM ($match);
$line = $pre . $match . $post;
}
say LOG "---\n<CC>\n$line\n===";
$line =~ s# +</#</#g;
$fileText .= $line . "\n";
}
say OUT "\t<book name=\"$book\">\n$fileText\t\t</chapter>\n\t</book>";
}
}
sub TagInternalUSFM {
my ($line, $placeNum) = ($_[0], 1);
my %places;
#say LOG "Tagging internal USFM in \$line $line.";
while ($line =~ /(<text>.*)(\\f .*?\\f\*)(.*<\/text>)/g) {
#say LOG "<+1>\t$2";
$line =~ s/(<text>.*)(\\f .*?\\f\*)(.*<\/text>)/$1<place number="$placeNum"\/>$3/;
$places{$placeNum} = $2;
$placeNum ++;
}
#say LOG "<+2>\t$line";
while ($line =~ /(<text>.*)(\\qs .*?\\qs\*)(.*<\/text>)/g) {
#say LOG "<+3>\t$2";
$line =~ s/(<text>.*)(\\qs .*?\\qs\*)(.*<\/text>)/$1<place number="$placeNum"\/>$3/;
$places{$placeNum} = $2;
$placeNum ++;
}
#say LOG "<+4>\t$line";
while ($line =~ /(<text>.*)(\\([bm]|pi?|q\d?|s2))( .*<\/text>)/g) {
#say LOG "<+5>\t$2";
$line =~ s/(<text>.*)(\\([bm]|pi?|q\d?|s2))(.*<\/text>)/$1<place number="$placeNum"\/>$4/;
$places{$placeNum} = $2;
$placeNum ++;
}
#say LOG "<+6>\t$line";
$line =~ s/ / /g;
#say LOG "<+7>\t$line";
foreach my $place (sort keys %places) {
#say LOG "<+8>\tReplacing <place number=\"$place\"\/> with <usfm>$places{$place}<\/usfm> in\n$line.";
unless ($line =~ s/<place number="$place"\/>/<usfm>$places{$place}<\/usfm>/) {die}
}
say LOG "<+9>\t$line";
return $line;
}

113
Check_ULB.pl Normal file
View File

@ -0,0 +1,113 @@
# Checks ULB in ULB.xml against tagged ULB
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
open(LOG, ">Logs/Log.txt") or die "$!";
my ($ULBxml, $taggedULBDir) = ("/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/ULB_xml/ULB.xml", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Manual_Tagging");
my (@filesToRun) = ();
my %fullName;
my $filePattern = "\.xml" ;
my $file;
my $xmlText = read_file("$ULBxml", binmode => 'utf8');
GetBooksToCheck();
Compare();
sub Compare {
foreach my $file (@filesToRun) {
say LOG "|$file|, |$taggedULBDir/$file.xml|";
my $taggedText = read_file("$taggedULBDir/$file.xml", binmode => 'utf8');
GetGist($file, $taggedText);
}
}
sub GetGist {
my ($fileName, $wholeTaggedText) = @_;
my ($verseRef, $standard, $tagged);
say LOG "|$fileName|, |$fullName{$fileName}|";
while ($wholeTaggedText =~ /<verse name="($fullName{$fileName} \d+:\d+)">((.|\n)*?)<preText>(.*?)<\/preText>((.|\n)*?)\n\t+((<w ((.|\n)*?)\n)*)\t+<\/verse>/sg) {
my ($preText, $gist) = ($4, $7);
$verseRef = $1;
say LOG "\$verseRef: |$verseRef|";
if ($xmlText =~ /<verse name="$verseRef">\n\t+<preText>([^\n]*)<\/preText>\n\t+<text>([^\n]*)<\/text>\n\t+<\/verse>/s) {
my ($standardPT, $standardT) = ($1, $2);
($tagged) = Untag($preText, $gist);
#say LOG $tagged;
$standard = $standardPT . " " . $standardT;
$standard =~ s/<[^<>]*>//g;
$standard =~ s/ {2,}/ /g;
$standard =~ s/ +$//;
if ($standard ne $tagged) {
say LOG "\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n"
}
}
}
}
sub Untag {
my ($pre, $txt) = ($_[0], $_[1]);
#say LOG "\$pre: $pre\n\$txt: $txt";
$txt =~ s/[\t\n]/ /g;
$txt =~ s/(√|<[^<>]*>)//g;
$txt = $pre . " " . $txt;
$txt =~ s/ {2,}/ /g;
$txt =~ s/ +$//;
return $txt;
}
sub GetBooksToCheck {
while (<DATA>) {
chomp;
unless (/^#/) {
if (/([^\t]*)\t([^\t]*)/) {
my ($file, $book) = ($1, $2);
say "|$file|";
push @filesToRun, "$file";
$fullName{$file} = $book;
}
}
}
}
close LOG;
say "\nDone.";
# =====
__DATA__
#41-MAT Matthew
#42-MRK Mark
#43-LUK Luke
#44-JHN John
#45-ACT Acts
#46-ROM Romans
#47-1CO 1 Corinthians
#48-2CO 2 Corinthians
#49-GAL Galatians
#50-EPH Ephesians
#51-PHP Philippians
#52-COL Colossians
#53-1TH 1 Thessalonians
#54-2TH 2 Thessalonians
#55-1TI 1 Timothy
#56-2TI 2 Timothy
57-TIT Titus
#58-PHM Philemon
#59-HEB Hebrews
#60-JAS James
#61-1PE 1 Peter
#62-2PE 2 Peter
#63-1JN 1 John
#64-2JN 2 John
#65-3JN 3 John
#66-JUD Jude
#67-REV Revelation

View File

@ -0,0 +1,410 @@
# Takes current tW entries and populates tagged OGNT XML
#
# This is the current best version
# Requires ULB that includes USFMs.
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
$" = "\n";
mkdir "Logs";
open(LOG, ">Logs/tW_pairs.txt") or die "$!";
my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml";
my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT";
#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test";
my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible";
my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", "");
my ($ULBText, $workText, $language);
my ($file);
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);
my @OGNTfilesToRun = ();
#my $filePattern = '\.xml' ;
my $filePattern = '52-COL\.xml' ;
find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
my @tWfilesToRun = ();
$filePattern = '.md' ;
find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;
Read_tW_Files();
close LOG;
open(LOG, ">Logs/Log.txt") or die "$!";
LongBookNames();
Prepare_ULB_file();
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
ProcessXML();
# put unused SN at end of verse
close LOG;
say "\nDone.";
# =====
sub Read_tW_Files {
foreach $file ( @tWfilesToRun ) {
say LOG $file;
my (@sns);
my $entries;
my $fileText = read_file("$file", binmode => 'utf8');
if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
my $sns = $1;
#say LOG "\t$sns";
@sns = split /, /, $sns;
}
if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
$entries = $1;
die "$fileText" if $entries eq "";
#say LOG "\t\t$entries"
}
foreach my $sn (@sns) {
$entriesThisSN{$sn} .= $entries . ", ";
#say LOG "\t\t\t$sn: $entriesThisSN{$sn}"
}
}
foreach my $sn (sort keys %entriesThisSN) {
#say LOG "$sn: $entriesThisSN{$sn}";
my @entries = split /, /, $entriesThisSN{$sn};
@entries = reverse sort { substr($a,0,1) <=> substr($b,0,1)
|| length($a) <=> length($b)
|| $a <=> $b }
@entries;
$entriesThisSN{$sn} = "";
foreach my $slice (@entries) {
$entriesThisSN{$sn} .= "$slice, "
}
$entriesThisSN{$sn} =~ s/, $//;
say LOG "$sn: $entriesThisSN{$sn}";
}
}
sub LongBookNames {
while (<DATA>) {
chomp;
if (/([^\t]*)\t([^\t]*)\t([^\t]*)/) {
$longName{$2} = $3
}
}
}
sub ProcessXML {
foreach my $file (@OGNTfilesToRun) {
my $greekText;
my $fileGist;
if ($file =~ /((..)....\.xml)/) {
($fileGist, $language) = ($1, $2);
if ($language > 40) {
$language = "G"
} else {$language = "H"}
}
say LOG "<0>\t$file \t$fileGist";
open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!";
my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput,
$linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText);
open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";
my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount);
while (my $thisLine = <$thisFile>) {
chomp $thisLine;
if ($thisLine =~ /<verse/) {say LOG "\n=========================="}
if ($thisLine =~ /<\/verse>/) {
say LOG "<0.1>\t$thisLine";
say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";
($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
my %orderedLine;
$matchedLines =~ s/\n{2,}/\n/gs;
say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
$orderedLine{$2} = $1;
say LOG "<5>\t\$2: $2\t\$1: $1";
}
$matchedLines = "";
foreach my $line (sort {$a <=> $b} keys %orderedLine) {
say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
$matchedLines .= "$orderedLine{$line}\n"
}
chomp $matchedLines;
say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
$residueText =~ s/<usfm>.*?<\/usfm>//g;
$residueText =~ s/(^q | q$)//g;
$residueText =~ s/ {3,}/ /g;
$residueText =~ s/^ +//;
$residueText =~ s/ +$/$1/;
$greekText =~ s/^ +//;
$greekText =~ s/ +$/$1/;
$staticText =~ s/^ +//;
$staticText =~ s/ +$/$1/;
my $internalUSFM;
$internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /<usfm>.*?<\/usfm>/g);
$linesNotMatched =~ s/\n+$//;
$linesToSkip =~ s/\n+$//;
$matchedLines =~ s/^\n+//;
$internalUSFM =~ s/\n+$//;
say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
say OUT "\t\t\t\t\t<Greek>$greekText</Greek>";
say OUT "\t\t\t\t\t<preText>$thisPreText</preText>";
say OUT "\t\t\t\t\t<ULB>$staticText</ULB>";
say OUT "\t\t\t\t\t<residue>$residueText</residue>";
say OUT "$matchedLines" unless ($matchedLines eq "");
say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
say OUT "$internalUSFM" unless ($internalUSFM eq "");
say OUT "$linesToSkip" if ($linesToSkip);
say OUT "$thisLine";
($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount) = ();
($thisVerseForOutput, $flag, $workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $orderedOutputLines, $linesWithRelevantSNs) = ();
($linesToSkip) = ("");
}
elsif ($thisLine =~ /<w /) {
say LOG "<0.2>\t$thisLine";
$originalLinesCount ++;
if ($thisLine =~ />([^\n<>]*)</) {
$greekText .= $1 . " "
}
$thisLine =~ s/(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
if ($thisLine =~ /lemma="(\d+)"/) {
my $thisLemma = $language . $1;
if (exists $entriesThisSN{$thisLemma}) {
$rsnCount ++;
$linesWithRelevantSNs .= $thisLine . "\n";
say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
}
else {
$skipCount ++;
$thisLine =~ s/><\/w>/>√<\/w>/;
$linesToSkip .= "$thisLine\n";
#say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
}
}
}
elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
say LOG "<0.3>\t$thisLine";
($bk, $ch, $vs) = ($1,$2,$3);
($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
$staticText = $ULBtextThisVerse{$thisVerse};
$residueText = "q $staticText q";
$thisPreText = $ULBpreTextThisVerse{$thisVerse};
say OUT "\t\t\t\t<verse name=\"$thisVerse\">";
($flag) = (1);
}
else {say OUT $thisLine}
}
close $thisFile;
close OUT;
}
}
sub ProcessRelevantSNs {
my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
my ($matchedLines, $thisLine);
my @relevantLines = split /\n/, $relevantLines;
foreach my $line (@relevantLines) {
if ($line =~ /lemma="(\d+)"/) {
my $thisSN = $language . $1;
say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";
($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
$thisLine =~ s/[ \t]+$//;
$matchedLines .= $thisLine . "\n";
$matchedLines =~ s/\n{2,}$/\n/s;
say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++"
}
}
return ($matchedLines, $residueText, $linesNotMatched);
}
sub MatchAndPlace {
my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
#say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText";
my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen);
my @entries = split /, /, $entriesThisSN{$sn};
foreach my $entry (@entries) {
my $entryType;
if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
($first, $second, $third) = ($1, $2, $3);
($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third;
say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|";
$entryType = 1;
}
elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
($first, $second) = ($1, $2);
($firstLen, $secondLen) = (length $first, length $second);
$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b";
say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|";
$entryType = 2;
}
else {$workEntry = $entry;}
my $foundText;
#say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
if ($workText =~ /\b$workEntry\b/p) {
say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}");
my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, "");
$line =~ s/></>$entry</;
while (length $replacementSpaces < $foundTextLength) {$replacementSpaces .= " "}
if ($entryType) {
say LOG "<8.2>\n\$workText,: $workText, \$matchedLines:\n$matchedLines ";
($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);
say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
}
else {$workText =~ s/ı/$replacementSpaces/;}
$matchedLines .= "$line◊$place";
say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
$found = 1;
}
else {
#say LOG "\$workEntry $workEntry not found"
}
if ($found) {
last
}
}
unless ($found) {
$line =~ s/></>?</;
$linesNotMatched .= "$line\n"
}
return ($matchedLines, $workText, $linesNotMatched)
}
sub FixWorkText {
my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;
my ($firstSpace, $secondSpace, $thirdSpace);
while (length $firstSpace < $firstLen) {$firstSpace .= " "}
while (length $secondSpace < $secondLen) {$secondSpace .= " "}
while (length $thirdSpace < $thirdLen) {$thirdSpace .= " "}
say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";
if ($third) {
if ($foundText =~ /$first(.*)$second(.*)$third/) {
my ($firstGap, $secondGap) = ($1, $2);
my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
say LOG "<9.1> \$repText: $repText";
$text =~ s/ı/$repText/;
}
}
else {
if ($foundText =~ /$first(.*)$second/) {
my ($firstGap) = ($1);
say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
my $repText ="$firstSpace$firstGap$secondSpace";
say LOG "<9.3> \$repText: |$repText|";
$text =~ s/ı/$repText/;
}
}
return ($text)
}
sub Prepare_ULB_file {
my $thisVerse;
#$ULBText = read_file($ULBfile, binmode => 'utf8');
#while ($ULBText =~ /<verse name="(.*?)">\n<preText>(.*?)<\/preText>\n.*<text>(.*?)<\/text>.*<\/verse>/sg) {
# ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2);
#}
#
open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";
while (my $thisLine = <$file>) {
chomp $thisLine;
if ($thisLine =~ /verse name="(.*?)"/) {
$thisVerse = $1;
#say LOG "$thisVerse:\n$thisLine"
}
elsif ($thisLine =~ /<preText>(.*?)<\/preText>/) {
$ULBpreTextThisVerse{$thisVerse} = $1;
#say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}"
}
elsif ($thisLine =~ /<text>(.*?)<\/text>/) {
$ULBtextThisVerse{$thisVerse} = $1;
#say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}"
}
}
close $file;
}
__DATA__
01 gen Genesis
02 exo Exodus
03 lev Leviticus
04 num Numbers
05 deu Deuteronomy
06 jos Joshua
07 jdg Judges
08 rut Ruth
09 1sa 1 Samuel
10 2sa 2 Samuel
11 1ki 1 Kings
12 2ki 2 Kings
13 1ch 1 Chronicles
14 2ch 2 Chronicles
15 ezr Ezra
16 neh Nehemiah
17 est Esther
18 job Job
19 psa Psalms
20 pro Proverbs
21 ecc Ecclesiastes
22 sng Song of Solomon
23 isa Isaiah
24 jer Jeremiah
25 lam Lamentations
26 ezk Ezekiel
27 dan Daniel
28 hos Hosea
29 jol Joel
30 amo Amos
31 oba Obadiah
32 jon Jonah
33 mic Micah
34 nam Nahum
35 hab Habakkuk
36 zep Zephaniah
37 hag Haggai
38 zec Zechariah
39 mal Malachi
41 mat Matthew
42 mrk Mark
43 luk Luke
44 jhn John
45 act Acts
46 rom Romans
47 1co 1 Corinthians
48 2co 2 Corinthians
49 gal Galatians
50 eph Ephesians
51 php Philippians
52 col Colossians
53 1th 1 Thessalonians
54 2th 2 Thessalonians
55 1ti 1 Timothy
56 2ti 2 Timothy
57 tit Titus
58 phm Philemon
59 heb Hebrews
60 jas James
61 1pe 1 Peter
62 2pe 2 Peter
63 1jn 1 John
64 2jn 2 John
65 3jn 3 John
66 jud Jude
67 rev Revelation

View File

@ -0,0 +1,59 @@
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
my %location;
open LOG, ">log/log.log" or die;
open(IN, "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt") or die "$!";
say "Reading ULB";
while (<IN>) {
#print LOG "$_";
chomp;
while (s/^([^\n\t]*)\t([^\n]*?)([A-Z][a-z]+(-[A-Z][a-z]+)?)/$1\t$2/) {
# say LOG $3;
unless (exists $location{$3}) {$location{$3} = $1}
}
}
close IN;
say "Outputting hash";
open(OUT, ">out/results.txt") or die "$!";
foreach my $word (sort keys %location) {
say OUT "$word, $location{$word}";
}
close OUT;
say "Deleting common words";
my $fileText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');
foreach my $word (sort keys %location) {
my $temp = lc $word;
#say LOG $word . "\t" . $temp;
if ($fileText =~ /\b$temp\b/) {
delete $location{$word}
}
}
say "Outputting final product";
open(OUT, ">out/results.txt") or die "$!";
foreach my $word (sort keys %location) {
say OUT "$word, $location{$word}";
}
close OUT;
close LOG;

View File

@ -0,0 +1,365 @@
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
my %value;
open LOG, ">log/log.log" or die;
while (<DATA>) {
chomp;
if (/^(.*)$/) {
$value{$1} = $1
}
}
my $fileText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');
$fileText = "\n" . $fileText;
foreach my $thisWord (sort keys %value) {
print $thisWord . "\n";
if ($fileText =~ /\n([^\n\t]*)\t[^\n]*$thisWord\b/) {
say LOG $thisWord . ", " . $1;
}
}
close LOG;
__DATA__
Aaron
Abel
Abiathar
Abijah
Abimelek
Abner
Abraham
Absalom
Adam
Adonijah
Ahab
Ahaz
Ahaziah
Ahijah
Ai
Amalek
Amaziah
Ammon
Amnon
Amorite
Amos
Amoz
Andrew
Annas
Antioch
Apollos
Aquila
Arabah
Arabia
Aram
Ararat
Artaxerxes
Asa
Asaph
Ashdod
Asher
Asherah
Ashkelon
Asia
Assyria
Athaliah
Azariah
Baal
Baasha
Babel
Babylon
Balaam
Barabbas
Barnabas
Bartholomew
Baruch
Bashan
Bathsheba
Beelzebul
Beersheba
Benaiah
Benjamin
Berea
Beth Shemesh
Bethany
Bethel
Bethlehem
Bethuel
Boaz
Caesar
Caesarea
Caiaphas
Cain
Caleb
Cana
Canaan
Capernaum
Carmel
Chaldea
Cilicia
Colossae
Corinth
Cornelius
Crete
Cush
Cyprus
Cyrene
Cyrus
Damascus
Dan
Daniel
Darius
David
Delilah
Eden
Edom
Egypt
Ekron
Elam
Eleazar
Eliakim
Elijah
Elisha
Elizabeth
En Gedi
Enoch
Ephesus
Ephraim
Ephrath
Esau
Esther
Ethiopia
Euphrates River
Eve
Ezekiel
Ezra
Gabriel
Gad
Galatia
Galilee
Gath
Gaza
Gerar
Geshur
Gethsemane
Gibeah
Gibeon
Gideon
Gilead
Gilgal
Girgashites
Golgotha
Goliath
Gomorrah
Goshen
Greece
Greek
Habakkuk
Hagar
Haggai
Ham
Hamath
Hamor
Hananiah
Hannah
Haran
Hebron
Hermon
Herod
Herodias
Hezekiah
Hilkiah
Hittite
Hivite
Horeb
Hosea
Hoshea
Iconium
Isaac
Isaiah
Ishmael
Issachar
Jacob
James
Japheth
Jebus
Jehoiachin
Jehoiada
Jehoiakim
Jehoram
Jehoshaphat
Jehu
Jephthah
Jeremiah
Jericho
Jeroboam
Jerusalem
Jesse
Jethro
Jezebel
Jezreel
Joab
Joash
Job
Joel
John
John Mark
Jonah
Jonathan
Joppa
Joram
Jordan River
Joseph
Joshua
Josiah
Jotham
Judah
Judas.*Iscariot
Judas son of James
Judea
Kadesh
Kedar
Kedesh
Kerethites
Kidron Valley
Korah
Laban
Lamech
Lazarus
Leah
Lebanon
Levi
Leviathan
Lot
Luke
Lystra
Maakah
Macedonia
Maker
Malachi
Manasseh
Martha
Mary
Mary
Mary.*Magdalene
Matthew
Mede
Melchizedek
Memphis
Meshech
Mesopotamia
Micah
Michael
Midian
Miriam
Mishael
Mizpah
Moab
Molech
Mordecai
Moses
Mount of Olives
Naaman
Nahor
Nahum
Naphtali
Nathan
Nazareth
Nebuchadnezzar
Negev
Nehemiah
Nile River
Nineveh
Noah
Obadiah
Omri
Paddan Aram
Paran
Paul
Peor
Perizzite
Persia
Peter
Pharaoh
Philip
Philippi
Philistia
Philistines
Phinehas
Phoenicia
Pilate
Pontus
Potiphar
Priscilla
Rabbah
Rachel
Rahab
Ramah
Ramoth
Rebekah
Rehoboam
Reuben
Rimmon
Rome
Ruth
Salt Sea
Samaria
Samson
Samuel
Sarah
Saul
Sea of Galilee
Sea of Reeds
Sennacherib
Seth
Sharon
Sheba
Shechem
Shem
Shiloh
Shimei
Shinar
Sidon
Silas
Simeon
Simon the Zealot
Sinai
Sodom
Solomon
Stephen
Sukkoth
Syria
Tamar
Tarshish
Tarsus
Terah
Thessalonica
Thomas
Timothy
Tirzah
Titus
Troas
Tubal
Tychicus
Tyre
Ur
Uriah
Uzziah
Vashti
Xerxes
Zacchaeus
Zadok
Zebedee
Zebulun
Zechariah
Zedekiah
Zephaniah
Zerubbabel
Zoar

24
Get_Strong_variants.pl Normal file
View File

@ -0,0 +1,24 @@
$pre = "https://biblehub.com/greek/";
$var = "a, b, c, d, e, f, g, h";
$post = ".htm";
@array = split (/, /, $var);
#$out = system "curl --fail https://biblehub.com/greek/2.htm";
#print "\n\n\t\$out: $out";
open OUT, ">out/results.txt";
foreach $xx (1611 .. 1613) {
foreach $var (@array) {
$string = $pre . $xx . $var . $post;
$out = `curl $string`;
if ($out =~ /We're sorry, we were not able to find that passage./) {
last
} else {
print OUT "\$xx: $xx. \$string: $string.";
}
}
}
close OUT;

38
Inventory_usfm_markers.pl Normal file
View File

@ -0,0 +1,38 @@
use 5.18.0;
use File::Slurp;
use File::Find ;
use Cwd ;
my $topDir = "/Users/Henry/Documents/WACS/en_ulb";
my %found;
my @filesToRun = ();
my $filePattern = '*.usfm' ;
open LOG, ">/Users/Henry/Google Drive/WA/Scripts/out/log.log" or die;
open OUT, ">/Users/Henry/Google Drive/WA/Scripts/out/output.txt" or die;
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
foreach my $file ( @filesToRun )
{
print "$file\n" ;
my $fileText = read_file("$file", binmode => 'utf8');
$fileText =~ s/\n/ /g;
while ($fileText =~ /(\\[^ ]*) /g) {
my $code = $1;
unless (exists $found{$code}) {
$found{$code} = $code
}
}
}
foreach my $code (sort keys %found) {
say OUT $code
}
close OUT;
close LOG;
print "\n\tDone."