update for MAST PDF

This commit is contained in:
Henry Whitney 2020-07-07 17:39:54 -04:00
parent c94831a0b2
commit cb1679ac8f
6 changed files with 487 additions and 545 deletions

File diff suppressed because it is too large Load Diff

View File

@ -349,13 +349,13 @@ sub Finish {
$tW_files .= "$key "; $tW_files .= "$key ";
say LOG "\$key: $key\t\$tW_file{$key}: $tW_file{$key}" say LOG "\$key: $key\t\$tW_file{$key}: $tW_file{$key}"
} }
#say "\nOpening .md files."; say "\nOpening .md files\n\$tW_files: $tW_files";
#if ($^O eq "darwin") { if ($^O eq "darwin") {
# #system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`; #system `$browser https://www.blueletterbible.org/lang/lexicon/lexicon.cfm?strongs=$strong`;
# system "perl $Bin/get_strongs_gist.pl"; system "perl $Bin/get_strongs_gist.pl";
# system `$textEditor $tW_files`; system `$textEditor $tW_files`;
# system `$textEditor $exceptions_file`; system `$textEditor $exceptions_file`;
#} }
#if ($^O eq "linux") { #if ($^O eq "linux") {
# say "curl $intrln_ref > $Bin/Temp/This_interlinear.html"; # say "curl $intrln_ref > $Bin/Temp/This_interlinear.html";
# system "curl $intrln_ref > $Bin/Temp/This_interlinear.html"; # system "curl $intrln_ref > $Bin/Temp/This_interlinear.html";

View File

@ -1,45 +0,0 @@
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Rewrites every translationWords "names" page found under $topDir:
#   * the level-1 heading is reduced to the main name (the text before the
#     first ", "), and
#   * the complete original heading line is appended under a
#     "## Forms Found in the English ULB:" section.
# Pages that already contain that section are logged and left alone.
# Results are written to $outDir; the source tree is never modified.
my $logPath = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Logs/log.txt";
my $outDir  = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Output/names";
open(my $log, '>', $logPath) or die "$logPath: $!";
my $topDir = "/Users/Henry/Documents/WACS/Restructure/bible/names";
my @filesToRun = ();
# The former pattern '*.md' was a shell glob interpolated into a regex
# (giving /^(.*)*.md$/), which also accepted names such as "cmd".
# Match a literal ".md" suffix instead.
my $filePattern = qr/\.md\z/;
# File::Find chdir()s into each directory by default, so $_ is the bare file
# name and the -f test works on it directly; directories are excluded.
find( sub { push @filesToRun, $File::Find::name if ( -f $_ && $_ =~ $filePattern ) }, $topDir );
foreach my $file ( @filesToRun ) {
    say {$log} $file;
    # Derive the output path from the base name.  If it cannot be derived,
    # skip the file: falling back to the input path would overwrite the source.
    my ($baseName) = $file =~ m{([^/]*\.md)\z};
    unless ( defined $baseName ) {
        say {$log} "\tSkipped: cannot derive an output name";
        next;
    }
    my $shortFile = "$outDir/$baseName";
    my $fileText = read_file($file, binmode => 'utf8');
    if ( $fileText =~ /Forms Found in the English ULB/ ) {
        say {$log} "\tForms Found in the English ULB";
        next;
    }
    # $nameLine is the heading text; $mainText is everything after that line.
    unless ( $fileText =~ /^# ([^\n]*)\n(.*)$/s ) {
        say {$log} "\tSkipped: no level-1 heading on the first line";
        next;
    }
    my ($nameLine, $mainText) = ($1, $2);
    say {$log} "\$nameLine: $nameLine\n\$mainText:\n$mainText\n\n";
    # The main name is the first comma-separated entry of the heading.
    my $mainName = $nameLine;
    if ( $nameLine =~ /^([^,]*), (.*)$/ ) {
        $mainName = $1;
    }
    $fileText = "# $mainName\n\n$mainText\n\n## Forms Found in the English ULB:\n\n$nameLine";
    # Collapse runs of blank lines left over from the concatenation.
    $fileText =~ s/\n{3,}/\n\n/g;
    open(my $out, '>', $shortFile) or die "$shortFile: $!";
    say {$out} $fileText;
    # Buffered write errors only surface at close, so check it.
    close $out or die "$shortFile: $!";
}
close $log;
say "Done."

View File

@ -1,62 +0,0 @@
# Adds Synonyms and Related Words section and
# Forms Found in the English ULB section
# to tW pages
use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Driver: collects every Markdown page under $topDir, rewrites its text with
# Process(), and writes the result to the mirrored path under $topOutDir via
# Output().
# Nothing in the visible script writes to the log, so a failure to open it is
# reported but does not stop the run.
open(my $log, '>', "Logs/log.log") or warn "Logs/log.log: $!";
my $topDir = "/Users/Henry/Documents/WACS/W_Q_Restructure/bible";
my $topOutDir = "/Users/Henry/Documents/WACS/W_Q_Restructure_new/bible";
my @filesToRun = ();
# The former pattern '*.md' was a shell glob interpolated into a regex
# (giving /^(.*)*.md$/), which also accepted names such as "cmd".
# Match a literal ".md" suffix instead.
my $filePattern = qr/\.md\z/;
# File::Find chdir()s into each directory by default, so $_ is the bare file
# name and the -f test works on it directly; directories are excluded.
find( sub { push @filesToRun, $File::Find::name if ( -f $_ && $_ =~ $filePattern ) }, $topDir );
foreach my $file ( @filesToRun ) {
    say $file;
    my $fileText = read_file("$file", binmode => 'utf8');
    my $outText = Process($fileText);
    Output($file, $outText);
}
close $log if defined fileno($log);
say "Done.";
# =====================
# Process($text) -> $newText
#
# Restructures one tW page:
#   * the level-1 heading is reduced to its first comma-separated entry,
#   * a "## Synonyms and Related Words:" section listing all heading entries
#     (sorted, joined with ", ") is inserted after the heading,
#   * the body follows, starting at the first "## Facts:" or "## Definition:"
#     heading when one exists,
#   * a "## Forms Found in the English ULB" section with the same sorted list
#     is appended.
# Runs of three or more newlines are collapsed to a single blank line.
#
# Text that does not start with a "# " heading is returned unchanged; the
# previous version replaced such a page with an empty skeleton.  When neither
# "## Facts:" nor "## Definition:" is present, everything after the heading
# line is kept as the body; the previous version dropped it.
sub Process {
    my $text = $_[0];
    return $text unless defined $text && $text =~ /\A# ([^\n]*)/;
    my $entries = $1;

    # Key word: the heading text up to the first comma.
    my ($keyWord) = $entries =~ /\A([^,]*)/;

    my $bulk;
    if ($text =~ /(## (?:Facts|Definition):.*)$/s) {
        $bulk = $1;
    }
    else {
        # No recognised section heading: keep the whole body after line 1.
        ($bulk = $text) =~ s/\A[^\n]*\n?//;
    }

    my $forms = join(', ', sort split(/, /, $entries));
    $text = "# $keyWord\n\n## Synonyms and Related Words:\n\n$forms\n\n$bulk\n\n## Forms Found in the English ULB\n\n$forms\n\n\n\n";
    # One global pass is enough: each match consumes a maximal run of newlines.
    $text =~ s/\n{3,}/\n\n/g;
    return $text;
}
# Output($sourcePath, $text)
#
# Writes $text to the location of $sourcePath mirrored under $topOutDir,
# i.e. with the leading $topDir replaced by $topOutDir.  Both are file-level
# variables.
# Dies if $sourcePath is not under $topDir, or if the file cannot be opened
# or fully written.
sub Output {
    my ($OutFile, $text) = ($_[0], $_[1]);
    # \Q..\E keeps regex metacharacters in the directory name literal, and
    # \A restricts the replacement to the path prefix.  If the prefix is
    # missing the path would stay unchanged and the source file would be
    # overwritten, so stop instead.
    $OutFile =~ s/\A\Q$topDir\E/$topOutDir/
        or die "Output: $OutFile is not under $topDir; refusing to overwrite the source file\n";
    open(my $out, '>', $OutFile) or die "$OutFile: $!";
    print {$out} $text;
    # Buffered write errors only surface at close, so check it.
    close $out or die "$OutFile: $!";
}

View File

@ -11,9 +11,9 @@ Repository directory: /Users/Henry/Documents/WACS
translationNotes path: en_tn translationNotes path: en_tn
Unlocked Literal Bible path: en_ulb Unlocked Literal Bible path: en_ulb
# translationNotes path: gl_.*_tn # translationNotes path: en_tn
translationWords path: gl_.*_bible.en_tw translationWords path: en_tw/bible
# Unlocked Literal Bible path: gl_.*_ulb # Unlocked Literal Bible path: en_ulb
Hebrew Bible XML directory: MAST_HB Hebrew Bible XML directory: MAST_HB
Greek Bible XML directory: OGNT Greek Bible XML directory: OGNT

View File

@ -61,7 +61,7 @@ my (@fileList);
# ============================== # ==============================
chdir("$pwd"); chdir("$pwd");
open LOG, ">:utf8", "Logs${d}Exc_log.log" or die "\$log: Logs${d}Exc_log.log: $!"; open LOG, ">:utf8", "Logs${d}1_Data_and_inputs.txt" or die "\$log: Logs${d}1_Data_andinputs.txt: $!";
open OUT, ">:utf8", $output or die "$!"; open OUT, ">:utf8", $output or die "$!";
open MISSING, ">$missing" or die "$!"; open MISSING, ">$missing" or die "$!";
@ -79,12 +79,14 @@ GetUserDefaults();
GetULBBooksToProcess(); GetULBBooksToProcess();
ReadExceptions(); ReadExceptions();
close LOG; close LOG;
open LOG, ">:utf8", "Logs${d}tW_pairs_log.txt" or die "Logs${d}tW_pairs_log.txt: $!"; open LOG, ">:utf8", "Logs${d}2_tW_pairs_log.txt" or die "Logs${d}2_tW_pairs_log.txt: $!";
PairtWEntriesTotWPageAndUniqSNs(); PairtWEntriesTotWPageAndUniqSNs();
close LOG; close LOG;
open LOG, ">:utf8", "Logs${d}tWs_from_MAST_log.txt" or die "tWs_from_MAST_log.txt: $!"; open LOG, ">:utf8", "Logs${d}3_tWs_from_MAST_log.txt" or die "3_tWs_from_MAST_log.txt: $!";
GetRelevantSNsForEachVerse(); GetRelevantSNsForEachVerse();
LinkULBtoCV(); LinkULBtoCV();
close LOG;
open LOG, ">:utf8", "Logs${d}4_Process_log.txt" or die "4_Process_log.txt: $!";
ProcessEachVerse(); ProcessEachVerse();
say OUT $finalOutString; say OUT $finalOutString;
@ -124,6 +126,8 @@ sub GetUserDefaults {
($topTwDir, $topOTSourceLangDir, $topNTSourceLangDir) = ("$repoPath${d}$twPath", "$repoPath${d}MAST_HB", "$repoPath${d}OGNT"); ($topTwDir, $topOTSourceLangDir, $topNTSourceLangDir) = ("$repoPath${d}$twPath", "$repoPath${d}MAST_HB", "$repoPath${d}OGNT");
say LOG "\$topTwDir: $topTwDir\n\$topOTSourceLangDir: $topOTSourceLangDir\n\$topNTSourceLangDir: $topNTSourceLangDir ";
close $defaults; close $defaults;
} }
@ -144,12 +148,13 @@ sub GetULBBooksToProcess {
} }
$sourceFile = "$topSourceLangDir${d}$this_bk.xml"; $sourceFile = "$topSourceLangDir${d}$this_bk.xml";
say LOG $sourceFile;
push @fileList, $sourceFile; push @fileList, $sourceFile;
} }
} }
close $file; close $file;
#say LOG "\@fileList:\n@fileList"; say LOG "===\n\@fileList:\n@fileList\n===\n";
} }
sub ReadExceptions { sub ReadExceptions {
@ -162,23 +167,24 @@ sub ReadExceptions {
#say LOG $line; #say LOG $line;
my $rf; my $rf;
if ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+)\t\|\|$/) { if ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+)\t\|\|$/) {
my ($oldNew) = ($2); my ($SNtoSkip) = ($2);
$rf = $1; $rf = $1;
#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew"; say LOG "<1>\t\$line: $line, \$rf: $rf, \$SNtoSkip: $SNtoSkip";
($deleteNum{$rf}) .= "$oldNew√"; ($deleteNum{$rf}) .= "$SNtoSkip√";
$specifiedText{$rf} = 1; $specifiedText{$rf} = 1;
#say LOG "\$specifiedText{$rf}: $specifiedText{$rf}"; #say LOG "\$specifiedText{$rf}: $specifiedText{$rf}";
} elsif ($line =~ /^([^#\n][^\t\n]*)\t(\d+\t\d+)/) { } elsif ($line =~ /^([^#\n][^\t\n]*)\t([GH]\d+\t[GH]\d+)/) {
my ($oldNew) = ($2); my ($oldNew) = ($2);
$rf = $1; $rf = $1;
#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew"; say LOG "<2>\t\$line: $line, \$rf: $rf, \$oldNew: $oldNew";
($adjust{$rf}) .= "$oldNew√"; ($adjust{$rf}) .= "$oldNew√";
$specifiedText{$rf} = 1; $specifiedText{$rf} = 1;
} }
elsif ($line =~ /^([^#\n\t][^\t\n]*)\t(.\d+)\t([^\t\n]*)\t([^\t\n]*)$/) { elsif ($line =~ /^([^#\n\t][^\t\n]*)\t(.\d+)\t([^\t\n]*)\t([^\t\n]*)$/) {
my ($rf, $sn, $snippet, $page) = ($1, $2, $3, $4); my ($rf, $sn, $snippet, $page) = ($1, $2, $3, $4);
#say LOG "\$line: $line, \$rf: $rf, \$oldNew: $oldNew"; say LOG "<3>\t\$rf: $rf, \$sn: $sn, \$snippet: $snippet, \$page: $page ";
$specifiedEntries{$rf} .= "$sn≈$snippet≈$page√"; $specifiedEntries{$rf} .= "$sn≈$snippet≈$page√";
$relevantSNsInCV{$rf} =~ s/$sn√?//;
$specifiedText{$rf} = 1; $specifiedText{$rf} = 1;
} }
@ -206,7 +212,7 @@ sub PairtWEntriesTotWPageAndUniqSNs {
if ($file =~ /\/([^\/]*)\/[^\/]*\.md/) { if ($file =~ /\/([^\/]*)\/[^\/]*\.md/) {
$dir{$shortFile} = $1 $dir{$shortFile} = $1
} }
say LOG "<0>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}"; say LOG "<4>\$shortFile: $shortFile\t\$dir{$shortFile}: $dir{$shortFile}";
#say "|$shortFile|"; die; #say "|$shortFile|"; die;
#if ($shortFile =~ /^(kt|names)/) { #if ($shortFile =~ /^(kt|names)/) {
#my $fileText = read_file("$file", binmode => 'utf8'); #my $fileText = read_file("$file", binmode => 'utf8');
@ -297,12 +303,13 @@ sub GetRelevantSNsForEachVerse {
if ($sourceFile =~ /(..)-...\.xml$/) { if ($sourceFile =~ /(..)-...\.xml$/) {
$hg = "H" if ($1 < 40); $hg = "H" if ($1 < 40);
} }
#say LOG "opening \$sourceFile: $sourceFile"; say LOG "opening \$sourceFile: $sourceFile";
open IN, "$sourceFile" or die "$sourceFile can't be opened\n\n"; open IN, "$sourceFile" or die "$sourceFile can't be opened\n\n";
my ($thisBook, $thisChap, $thisVers, $thisCV); my ($thisBook, $thisChap, $thisVers, $thisCV);
my (@pages); my (@pages);
while (<IN>) { while (<IN>) {
chomp; chomp;
say LOG "<\@>\t$_";
if (/<verse osisID="([^\.]*).(\d+).(\d+)">/) { if (/<verse osisID="([^\.]*).(\d+).(\d+)">/) {
#say LOG "$thisCV: \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";# Making sure previous verse is populated #say LOG "$thisCV: \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";# Making sure previous verse is populated
my ($bk, $ch, $vs) = ($1, $2, $3); my ($bk, $ch, $vs) = ($1, $2, $3);
@ -315,15 +322,20 @@ sub GetRelevantSNsForEachVerse {
#say LOG "##\t$bk $ch:$vs, $thisCV"; #say LOG "##\t$bk $ch:$vs, $thisCV";
} }
else { else {
s/(lemma=").*?(\d+).*?("\n)/$1$2$3/; if (/lemma="([^"]*)"/) {
while (/<w lemma="(\d+)"/g) { my $gist = $1;
#say LOG $_; say LOG "<\@\@>\t\$gist: $gist";
my ($thisSN) = ($hg . $1); if ($gist =~ /\d+/) {
#say LOG "\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}"; s/(lemma=")[^\d]*?(\d+)[^\d]*?(")/$1$2$3/;
if (exists $relevantSNs{$thisSN}) { while (/<w lemma="(\d+)"/g) {
$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/); my ($thisSN) = ($hg . $1);
say LOG "<\@\@\@>\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
if (exists $relevantSNs{$thisSN}) {
$relevantSNsInCV{$thisCV} .= "$thisSN√" unless ($relevantSNsInCV{$thisCV} =~ /\b$thisSN\b/);
}
say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
}
} }
#say LOG ">\t\$thisSN: $thisSN, \$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
} }
} }
} }
@ -348,16 +360,18 @@ sub ProcessEachVerse {
foreach my $key (sort keys %orderRef) { foreach my $key (sort keys %orderRef) {
# for each verse # for each verse
my ($thisCV) = ($orderRef{$key}); my ($thisCV) = ($orderRef{$key});
say LOG "\n<1>\n$thisCV\t$ULBtext{$thisCV}\n\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\t\$deleteNum{$thisCV}: $deleteNum{$thisCV}, \$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}"; say LOG "\n<5>\n$thisCV\t$ULBtext{$thisCV}\n\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}\n\t\$deleteNum{$thisCV}: $deleteNum{$thisCV},\n\t\$specifiedEntries{$thisCV}: $specifiedEntries{$thisCV}";
($relevantSNsInCV{$thisCV}) = DeleteSpecifiedWords ($relevantSNsInCV{$thisCV}, $specifiedEntries{$thisCV});
($relevantSNsInCV{$thisCV}) = DeleteObviatedSNs($relevantSNsInCV{$thisCV}, $deleteNum{$thisCV}); ($relevantSNsInCV{$thisCV}) = DeleteObviatedSNs($relevantSNsInCV{$thisCV}, $deleteNum{$thisCV});
# delete obviated SNs # delete obviated SNs
say LOG "\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}"; say LOG "<6>\t\$relevantSNsInCV{$thisCV}: $relevantSNsInCV{$thisCV}";
my $processSequence = "$specifiedEntries{$thisCV}√$relevantSNsInCV{$thisCV}"; my $processSequence = "$specifiedEntries{$thisCV}√$relevantSNsInCV{$thisCV}";
$processSequence =~ s/√+/√/g; $processSequence =~ s/√+/√/g;
$processSequence =~ s/^√+//; $processSequence =~ s/^√+//;
say LOG "\t\t\$processSequence: $processSequence"; say LOG "\t\$processSequence: $processSequence";
$finalOutString .= ExecuteProcessSequence($thisCV, $processSequence, $ULBtext{$thisCV}); $finalOutString .= ExecuteProcessSequence($thisCV, $processSequence, $ULBtext{$thisCV});
@ -365,9 +379,22 @@ sub ProcessEachVerse {
} }
# DeleteSpecifiedWords($sns, $toDelete) -> $sns
#
# $sns      : "√"-separated list of Strong's numbers (e.g. "H12√H123√").
# $toDelete : "√"-separated list of "sn≈snippet≈page" entries; only the part
#             before the first "≈" (the Strong's number) is used.
#
# Returns $sns without the Strong's numbers named in $toDelete.  Numbers are
# compared as whole tokens.  The previous regex-based version interpolated the
# number unescaped and matched substrings, so deleting "H12" could corrupt
# "H123".  When nothing is removed, $sns is returned exactly as passed in;
# otherwise the remaining numbers are returned, each followed by "√".
sub DeleteSpecifiedWords {
    my ($sns, $toDelete) = @_;
    say LOG "<5.1>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
    return $sns unless defined $sns && defined $toDelete;
    my %skip;
    my @delete = split /√/, $toDelete;
    foreach my $one (@delete) {
        say LOG "<5.1.1>\t\$one: $one";
        # Keep only the Strong's number of an "sn≈snippet≈page" entry.
        $one =~ s/^([^≈]*)≈.*$/$1/;
        say LOG "<5.1.2>\t\$one: $one";
        $skip{$one} = 1 if length $one;
    }
    my @tokens = grep { length } split /√/, $sns;
    my @kept   = grep { !$skip{$_} } @tokens;
    # Nothing matched: hand back the caller's string untouched.
    return $sns if @kept == @tokens;
    return join '', map { "$_√" } @kept;
}
sub DeleteObviatedSNs { sub DeleteObviatedSNs {
my ($sns, $toDelete) = @_; my ($sns, $toDelete) = @_;
my @sns = split /√/, $sns; say LOG "<5.2>\t\$sns:\t\t$sns\n\t\t\$toDelete:\t$toDelete";
my @delete = split /√/, $toDelete; my @delete = split /√/, $toDelete;
foreach my $one (@delete) { foreach my $one (@delete) {
$sns =~ s/^(.*)$one(√)?(.*)$/$1$2$3/; $sns =~ s/^(.*)$one(√)?(.*)$/$1$2$3/;
@ -379,7 +406,7 @@ sub ExecuteProcessSequence {
my ($tempText, $thisCVOutString, $position, $outputFormRef) = ($trueText, ""); my ($tempText, $thisCVOutString, $position, $outputFormRef) = ($trueText, "");
my (%snippetSequence); my (%snippetSequence);
my (@SNsequence) = split /√/, $sequence; my (@SNsequence) = split /√/, $sequence;
say LOG "$ref: @SNsequence"; say LOG "$ref:\n@SNsequence";
if ($ref =~ /^([^:]*) (\d+):(\d+)/) { if ($ref =~ /^([^:]*) (\d+):(\d+)/) {
$outputFormRef = "$1,$2,$3" $outputFormRef = "$1,$2,$3"
} }
@ -388,13 +415,15 @@ sub ExecuteProcessSequence {
# for each relevant SN in verse # for each relevant SN in verse
# for each tW entry # for each tW entry
# if specified tW # if specified tW
say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}"; say LOG "=====\n\$ref: $ref\t\$candidate: $candidate\t\$entriesThisSN{$candidate}: $entriesThisSN{$candidate}\n$tempText";
my ($found, $sn, $ulbWord, $tWpage); my ($found, $sn, $ulbWord, $tWpage);
if ($candidate =~ /([^≈]*)≈([^≈]*)≈([^≈]*)/) { if ($candidate =~ /([^≈]*)≈([^≈]*)≈([^≈]*)/) {
# get position in true text to array # get position in true text to array
# delete found text from temp text # delete found text from temp text
($sn, $ulbWord, $tWpage) = ($1,$2,$3); ($sn, $ulbWord, $tWpage) = ($1,$2,$3);
while ($ulbWord =~ s/^(.*) \.\.\. (.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)\\b(.*?)\\b($3)/) {}
while ($ulbWord =~ s/^(.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)/) {} while ($ulbWord =~ s/^(.*) \.\.\. (.*)/($1)\\b(.*?)\\b($2)/) {}
say LOG "<A>\t\$ulbWord: $ulbWord";
if ($tempText =~ s/^(.*)\b$ulbWord\b(.*)$/$1$2/) { if ($tempText =~ s/^(.*)\b$ulbWord\b(.*)$/$1$2/) {
$position = length $1; $position = length $1;
$snippetSequence{$position} = "$ulbWord,$dir{$tWpage},$tWpage"; $snippetSequence{$position} = "$ulbWord,$dir{$tWpage},$tWpage";
@ -416,16 +445,18 @@ sub ExecuteProcessSequence {
# get ULB snippet to verse match list # get ULB snippet to verse match list
# get position in true text to array # get position in true text to array
# delete found text from temp text # delete found text from temp text
if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3/i) { if ($thisEntry =~ /\(\.\*\?\)/ && $tempText =~ /$thisEntry/i) {
print LOG "<2>\t\$thisEntry |$thisEntry| is found in the first test\n___"; say LOG "<7>\t\$thisEntry |$thisEntry| is found in the first test";
$tempText =~ s/^(.*)\b$thisEntry\b(.*)$/$1$2/; if ($tempText =~ s/^(.*)\b($thisEntry)\b(.*)$/$1$3$4/i) {
say LOG "<7.1>\t\$1: $1 \$2: $2 \$3: $3 \$4: $4\t\$5: $5";
}
if ($trueText =~ /^(.*)\b($thisEntry)\b.*$/) { if ($trueText =~ /^(.*)\b($thisEntry)\b.*$/) {
$position = length $1; $position = length $1;
} }
$snippetSequence{$position} = "$thisEntry,$dir{$pagesThisEntry{$thisEntry}},$pagesThisEntry{$thisEntry}"; $snippetSequence{$position} = "$thisEntry,$dir{$pagesThisEntry{$thisEntry}},$pagesThisEntry{$thisEntry}";
$found = 1; $found = 1;
goto Breakout; goto Breakout;
} }
elsif ($tempText =~ s/\b($thisEntry)[^\w']//i || $tempText =~ s/\b($thisEntry)["']//i || $tempText =~ s/["']($thisEntry)\b//i) { elsif ($tempText =~ s/\b($thisEntry)[^\w']//i || $tempText =~ s/\b($thisEntry)["']//i || $tempText =~ s/["']($thisEntry)\b//i) {
say LOG "\$thisEntry |$thisEntry| is found in the second test\n--- say LOG "\$thisEntry |$thisEntry| is found in the second test\n---
"; ";