From 91d789da65d6c53cf78ec62de3f8a19d1133f41e Mon Sep 17 00:00:00 2001 From: Henry Whitney Date: Tue, 28 Jul 2020 17:34:00 -0400 Subject: [PATCH] Now needed for tagging --- Build_OL_files_from_XML.pl | 180 ++++++++ Build_ULB_XML_for_Tagging.pl | 126 ++++++ Check_ULB.pl | 113 +++++ ...s_from_unified_ULB_XML_and_tWs_and_OGNT.pl | 410 ++++++++++++++++++ Find_first_occurrence_of_proper_nouns.pl | 59 +++ Find_first_occurrence_of_tWs.pl | 365 ++++++++++++++++ Get_Strong_variants.pl | 24 + Inventory_usfm_markers.pl | 38 ++ 8 files changed, 1315 insertions(+) create mode 100644 Build_OL_files_from_XML.pl create mode 100644 Build_ULB_XML_for_Tagging.pl create mode 100644 Check_ULB.pl create mode 100644 Construct_tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl create mode 100644 Find_first_occurrence_of_proper_nouns.pl create mode 100644 Find_first_occurrence_of_tWs.pl create mode 100644 Get_Strong_variants.pl create mode 100644 Inventory_usfm_markers.pl diff --git a/Build_OL_files_from_XML.pl b/Build_OL_files_from_XML.pl new file mode 100644 index 00000000..9a5fcd8b --- /dev/null +++ b/Build_OL_files_from_XML.pl @@ -0,0 +1,180 @@ +# Builds easily searchable file from current OGNT and MAST-HB XML file +# Takes verse at a time from slurped file +# Useful for Mine routine building MAST PDF +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; +open LOG, ">:utf8", "Logs/log.txt" or die; +open OUT, ">:utf8", "Output/Original_languages.txt" or die; + +my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT"); +my (%order, %long); +my $outText; + +while () { + chomp; + if (/^([^\t]*)\t([^\t]*)\t(.*)$/) { + $order{$1} = $3; + $long{$2} = $3; + } +} +#foreach my $key (sort keys %long) { +# say LOG $key . "\t" . $long{$key}; +#} + +foreach my $folder (@folders) { + say LOG "$folder"; + #system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;"; + my ($topDir, $lang) = ($folder, "H"); + + if ($folder =~ /OGNT/) {$lang = "G"} + + my @filesToRun = (); + my $filePattern = '*.xml' ; + find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; + @filesToRun = sort @filesToRun; + foreach my $file ( @filesToRun ) { + say LOG $file; + my $fileText = read_file("$file", binmode => 'utf8'); + my ($bk, $ch, $vs, $lemma, $word, $nbk, $nch, $nvs, $previous, $current, $interruption, $next, $verse, $thisBookText, $prevVsText, $holdText, $thisVsText, $nextVsText, $oldHold, $shortCur, $shortIntr); + while ($fileText =~ //spg) { + $verse = $&; + say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4"; + ($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4); + $previous = $current; + $current = "$bk $ch:$vs"; + say LOG "<0>\t\$current: $current"; + my $verseText; + + if ($verse =~ /KJV:(([^\.]*)\.([^\.]*).([^<]*))<\/note>/p) { # Occurs only in OT + say LOG "<1>\t$&"; + ($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4); + $interruption = "$nbk $nch:$nvs"; + say LOG "<2>\t\$interruption: $interruption (of $current)"; + if ($verse =~ /\n[^<\n]*KJV:$shortIntr<\/note>/) { # Complete renumber of verse + say LOG "<3>\t$&"; + $current = $interruption; + $verseText = GetContent($verse); + $verseText = "$current\t$oldHold$verseText"; + $oldHold = ""; + } + elsif ($interruption ne $current && $verse =~ /KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # New verse begins here + say LOG "<4>\t$&"; + ($thisVsText, $nextVsText) = (${^PREMATCH}, ${^POSTMATCH}); + $thisVsText = GetContent($thisVsText); + $nextVsText = GetContent($nextVsText); + $outText .= "$oldHold\n$current\t$thisVsText "; + $oldHold = "$nextVsText "; + } + elsif ($interruption eq $current && $verse =~ /KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # Previous verse continues here + say LOG "<5>\t$&"; + ($prevVsText, $thisVsText) = (${^PREMATCH}, ${^POSTMATCH}); + $prevVsText = GetContent($prevVsText); + $thisVsText = GetContent($thisVsText); + $verseText .= "$oldHold\n$current\t$thisVsText"; + $oldHold = ""; + } + } + else { + # The whole verse should be processed in one piece + #$verseText = GetContent($verse); + #$verseText = "$current\$tverseText" + } + #$thisBookText .= "\n$verseText"; + #$oldHold = $holdText + } + #$thisBookText =~ s/]*>([^<]*)<\/w>/) { + my ($lemma, $OL) = ($1, $2); + $lemma =~ s/[^\d"]*(\d+)[^\d"]*/$1/; + $returnText .= "$OL <$lemma> " + } + return $returnText +} + +__DATA__ +01 gen Genesis +02 exo Exodus +03 lev Leviticus +04 num Numbers +05 deu Deuteronomy +06 jos Joshua +07 jdg Judges +08 rut Ruth +09 1sa 1 Samuel +10 2sa 2 Samuel +11 1ki 1 Kings +12 2ki 2 Kings +13 1ch 1 Chronicles +14 2ch 2 Chronicles +15 ezr Ezra +16 neh Nehemiah +17 est Esther +18 job Job +19 psa Psalms +20 pro Proverbs +21 ecc Ecclesiastes +22 sng Song of Solomon +23 isa Isaiah +24 jer Jeremiah +25 lam Lamentations +26 ezk Ezekiel +27 dan Daniel +28 hos Hosea +29 jol Joel +30 amo Amos +31 oba Obadiah +32 jon Jonah +33 mic Micah +34 nam Nahum +35 hab Habakkuk +36 zep Zephaniah +37 hag Haggai +38 zec Zechariah +39 mal Malachi +41 mat Matthew +42 mrk Mark +43 luk Luke +44 jhn John +45 act Acts +46 rom Romans +47 1co 1 Corinthians +48 2co 2 Corinthians +49 gal Galatians +50 eph Ephesians +51 php Philippians +52 col Colossians +53 1th 1 Thessalonians +54 2th 2 Thessalonians +55 1ti 1 Timothy +56 2ti 2 Timothy +57 tit Titus +58 phm Philemon +59 heb Hebrews +60 jas James +61 1pe 1 Peter +62 2pe 2 Peter +63 1jn 1 John +64 2jn 2 John +65 3jn 3 John +66 jud Jude +67 rev Revelation diff --git a/Build_ULB_XML_for_Tagging.pl b/Build_ULB_XML_for_Tagging.pl new file mode 100644 index 00000000..5c86d5e4 --- /dev/null +++ b/Build_ULB_XML_for_Tagging.pl @@ -0,0 +1,126 @@ +# Creates workable ULB.xml file that has all USFM markers in place. + +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; + +open(LOG, ">Logs/Log.txt") or die "$!"; +open(OUT, ">/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml") or die "$!"; +say OUT ""; + +my ($topDir, $outDir) = ("/Users/Henry/Documents/WACS/en_ulb", "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml"); + +my @filesToRun = (); +my $filePattern = '\.usfm' ; +#my $filePattern = '67-REV\.usfm' ; +my $file; +find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; + +@filesToRun = sort @filesToRun; + +ReadFiles(); + +say OUT ""; +close OUT; +close LOG; + +say "\nDone."; +# ===== +sub ReadFiles { + + foreach $file ( @filesToRun ) { + say $file; + my @array; + my $fileText = read_file("$file", binmode => 'utf8'); + $fileText =~ s/[ \n]+$//; + say LOG "|$fileText|"; + +#Delete \n + my ($book, $chap, $vers, $chapStart); + if ($fileText =~ /\\h ([^\n]*)/) { + $book = $1 + } + #say LOG $book; + $fileText =~ s/\n/ /g; + $fileText =~ s/ / /g; + $fileText =~ s/\\s5/\n$&/g; + $fileText =~ s/\\v/√/g; + while ($fileText =~ s/(√[^√\n]*)(√)/$1\n$2/) {} + $fileText =~ s/√/\\v/g; + $fileText =~ s/(\\id[^\n]*)\n/\t\t$1<\/heading>\n/; + $fileText =~ s/ +\n/\n/g; + $fileText =~ s/(\\(q\d?|pi?|m|n?b))\n/\n$1 /g; + #say LOG $fileText; + @array = split /\n/, $fileText; + $fileText = ""; + foreach my $line (@array) { + chomp; + if ($line =~ //) {$book = $1;} + if ($line =~ /\\c (\d+).* \\v (\d+)/) { + ($chap, $vers) = ($1, $2); + $line = "\t\t\n\t\t\t$line"; + $line = "\t\t\n$line" if $chapStart; + $chapStart = 1; + } + elsif ($line =~ /\\v (\d+)/) { + $vers = $1; + $line = "\t\t\t$line" + } + #say LOG "===\n\n$line"; + $line =~ s/(]*>)(.*\\v \d+ )(.*)(<\/verse>)/$1\n\t\t\t\t$2<\/preText>\n\t\t\t\t$3<\/text>\n\t\t\t$4/s; + #say LOG "===\n\n$line"; + if ($line =~ /.*<\/text>/p) { + say LOG "<-0>\t$line"; + my ($pre, $match, $post) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH}); + #say LOG "<-1>\t\$pre: $pre,\n\$match: $match,\n\$post: $post"; + $match = TagInternalUSFM ($match); + $line = $pre . $match . $post; + } + + say LOG "---\n\n$line\n==="; + $line =~ s# +\n$fileText\t\t\n\t"; + } + +} + +sub TagInternalUSFM { + my ($line, $placeNum) = ($_[0], 1); + my %places; + #say LOG "Tagging internal USFM in \$line $line."; + while ($line =~ /(.*)(\\f .*?\\f\*)(.*<\/text>)/g) { + #say LOG "<+1>\t$2"; + $line =~ s/(.*)(\\f .*?\\f\*)(.*<\/text>)/$1$3/; + $places{$placeNum} = $2; + $placeNum ++; + } + #say LOG "<+2>\t$line"; + while ($line =~ /(.*)(\\qs .*?\\qs\*)(.*<\/text>)/g) { + #say LOG "<+3>\t$2"; + $line =~ s/(.*)(\\qs .*?\\qs\*)(.*<\/text>)/$1$3/; + $places{$placeNum} = $2; + $placeNum ++; + } + #say LOG "<+4>\t$line"; + while ($line =~ /(.*)(\\([bm]|pi?|q\d?|s2))( .*<\/text>)/g) { + #say LOG "<+5>\t$2"; + $line =~ s/(.*)(\\([bm]|pi?|q\d?|s2))(.*<\/text>)/$1$4/; + $places{$placeNum} = $2; + $placeNum ++; + } + #say LOG "<+6>\t$line"; + $line =~ s/ / /g; + #say LOG "<+7>\t$line"; + foreach my $place (sort keys %places) { + #say LOG "<+8>\tReplacing with $places{$place}<\/usfm> in\n$line."; + unless ($line =~ s//$places{$place}<\/usfm>/) {die} + } + say LOG "<+9>\t$line"; + return $line; +} diff --git a/Check_ULB.pl b/Check_ULB.pl new file mode 100644 index 00000000..0fe499c9 --- /dev/null +++ b/Check_ULB.pl @@ -0,0 +1,113 @@ +# Checks ULB in ULB.xml against tagged ULB +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; + +open(LOG, ">Logs/Log.txt") or die "$!"; + +my ($ULBxml, $taggedULBDir) = ("/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/ULB_xml/ULB.xml", "/Users/Henry/Documents/WACS/Tips_and_Hacks/Tagged_OGNT/Manual_Tagging"); +my (@filesToRun) = (); +my %fullName; +my $filePattern = "\.xml" ; +my $file; + +my $xmlText = read_file("$ULBxml", binmode => 'utf8'); + +GetBooksToCheck(); +Compare(); + +sub Compare { + foreach my $file (@filesToRun) { + + say LOG "|$file|, |$taggedULBDir/$file.xml|"; + my $taggedText = read_file("$taggedULBDir/$file.xml", binmode => 'utf8'); + + GetGist($file, $taggedText); + } +} + +sub GetGist { + my ($fileName, $wholeTaggedText) = @_; + my ($verseRef, $standard, $tagged); + say LOG "|$fileName|, |$fullName{$fileName}|"; + while ($wholeTaggedText =~ /((.|\n)*?)(.*?)<\/preText>((.|\n)*?)\n\t+((/sg) { + my ($preText, $gist) = ($4, $7); + $verseRef = $1; + say LOG "\$verseRef: |$verseRef|"; + if ($xmlText =~ /\n\t+([^\n]*)<\/preText>\n\t+([^\n]*)<\/text>\n\t+<\/verse>/s) { + my ($standardPT, $standardT) = ($1, $2); + ($tagged) = Untag($preText, $gist); + #say LOG $tagged; + $standard = $standardPT . " " . $standardT; + $standard =~ s/<[^<>]*>//g; + $standard =~ s/ {2,}/ /g; + $standard =~ s/ +$//; + if ($standard ne $tagged) { + say LOG "\nMISMATCH:\n\$standard\n$standard\n\$tagged\n$tagged\n" + } + } + } +} + +sub Untag { + my ($pre, $txt) = ($_[0], $_[1]); + #say LOG "\$pre: $pre\n\$txt: $txt"; + $txt =~ s/[\t\n]/ /g; + $txt =~ s/(√|<[^<>]*>)//g; + $txt = $pre . " " . $txt; + $txt =~ s/ {2,}/ /g; + $txt =~ s/ +$//; + return $txt; +} + +sub GetBooksToCheck { + while () { + chomp; + unless (/^#/) { + if (/([^\t]*)\t([^\t]*)/) { + my ($file, $book) = ($1, $2); + say "|$file|"; + push @filesToRun, "$file"; + $fullName{$file} = $book; + } + } + } +} + + +close LOG; + +say "\nDone."; +# ===== +__DATA__ +#41-MAT Matthew +#42-MRK Mark +#43-LUK Luke +#44-JHN John +#45-ACT Acts +#46-ROM Romans +#47-1CO 1 Corinthians +#48-2CO 2 Corinthians +#49-GAL Galatians +#50-EPH Ephesians +#51-PHP Philippians +#52-COL Colossians +#53-1TH 1 Thessalonians +#54-2TH 2 Thessalonians +#55-1TI 1 Timothy +#56-2TI 2 Timothy +57-TIT Titus +#58-PHM Philemon +#59-HEB Hebrews +#60-JAS James +#61-1PE 1 Peter +#62-2PE 2 Peter +#63-1JN 1 John +#64-2JN 2 John +#65-3JN 3 John +#66-JUD Jude +#67-REV Revelation diff --git a/Construct_tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl b/Construct_tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl new file mode 100644 index 00000000..1ac4d163 --- /dev/null +++ b/Construct_tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl @@ -0,0 +1,410 @@ +# Takes current tW entries and populates tagged OGNT XML +# +# This is the current best version +# Requires ULB that includes USFMs. + + +use 5.12.0; +use File::Slurp; +use File::Find ; +use Cwd ; +use utf8; +#use open IN => ":utf8", OUT => ":utf8"; +use open IO => ":utf8"; +$" = "\n"; + +mkdir "Logs"; +open(LOG, ">Logs/tW_pairs.txt") or die "$!"; +my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml"; +my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT"; +#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test"; +my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible"; +my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", ""); +my ($ULBText, $workText, $language); +my ($file); +my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName); + +my @OGNTfilesToRun = (); +#my $filePattern = '\.xml' ; +my $filePattern = '52-COL\.xml' ; +find( sub { push @OGNTfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ; + +say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; + +my @tWfilesToRun = (); +$filePattern = '.md' ; +find( sub { push @tWfilesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ; + +Read_tW_Files(); + +close LOG; +open(LOG, ">Logs/Log.txt") or die "$!"; + +LongBookNames(); +Prepare_ULB_file(); +say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n"; + +ProcessXML(); +# put unused SN at end of verse + +close LOG; + +say "\nDone."; +# ===== +sub Read_tW_Files { + foreach $file ( @tWfilesToRun ) { + say LOG $file; + my (@sns); + my $entries; + my $fileText = read_file("$file", binmode => 'utf8'); + if ($fileText =~ /\* Strong's: ([^\n]*)\n/) { + my $sns = $1; + #say LOG "\t$sns"; + @sns = split /, /, $sns; + } + if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) { + $entries = $1; + die "$fileText" if $entries eq ""; + #say LOG "\t\t$entries" + } + foreach my $sn (@sns) { + $entriesThisSN{$sn} .= $entries . ", "; + #say LOG "\t\t\t$sn: $entriesThisSN{$sn}" + } + } + foreach my $sn (sort keys %entriesThisSN) { + #say LOG "$sn: $entriesThisSN{$sn}"; + my @entries = split /, /, $entriesThisSN{$sn}; + @entries = reverse sort { substr($a,0,1) <=> substr($b,0,1) + || length($a) <=> length($b) + || $a <=> $b } + @entries; + $entriesThisSN{$sn} = ""; + foreach my $slice (@entries) { + $entriesThisSN{$sn} .= "$slice, " + } + $entriesThisSN{$sn} =~ s/, $//; + say LOG "$sn: $entriesThisSN{$sn}"; + } + +} +sub LongBookNames { + while () { + chomp; + if (/([^\t]*)\t([^\t]*)\t([^\t]*)/) { + $longName{$2} = $3 + } + } +} +sub ProcessXML { + foreach my $file (@OGNTfilesToRun) { + my $greekText; + my $fileGist; + if ($file =~ /((..)....\.xml)/) { + ($fileGist, $language) = ($1, $2); + if ($language > 40) { + $language = "G" + } else {$language = "H"} + + } + say LOG "<0>\t$file \t$fileGist"; + open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!"; + my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput, + $linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText); + open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!"; + my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount); + while (my $thisLine = <$thisFile>) { + chomp $thisLine; + if ($thisLine =~ //) { + say LOG "<0.1>\t$thisLine"; + say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText"; + + ($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText); + say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + my %orderedLine; + $matchedLines =~ s/\n{2,}/\n/gs; + say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n"; + while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) { + $orderedLine{$2} = $1; + say LOG "<5>\t\$2: $2\t\$1: $1"; + } + $matchedLines = ""; + foreach my $line (sort {$a <=> $b} keys %orderedLine) { + say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}"; + $matchedLines .= "$orderedLine{$line}\n" + } + chomp $matchedLines; + say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + $residueText =~ s/.*?<\/usfm>//g; + $residueText =~ s/(^q | q$)//g; + $residueText =~ s/ {3,}/ /g; + $residueText =~ s/^ +//; + $residueText =~ s/ +$/$1/; + $greekText =~ s/^ +//; + $greekText =~ s/ +$/$1/; + $staticText =~ s/^ +//; + $staticText =~ s/ +$/$1/; + my $internalUSFM; + $internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /.*?<\/usfm>/g); + $linesNotMatched =~ s/\n+$//; + $linesToSkip =~ s/\n+$//; + $matchedLines =~ s/^\n+//; + $internalUSFM =~ s/\n+$//; + say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched"; + say OUT "\t\t\t\t\t$greekText"; + say OUT "\t\t\t\t\t$thisPreText"; + say OUT "\t\t\t\t\t$staticText"; + say OUT "\t\t\t\t\t$residueText"; + say OUT "$matchedLines" unless ($matchedLines eq ""); + say OUT "$linesNotMatched" unless ($linesNotMatched eq ""); + say OUT "$internalUSFM" unless ($internalUSFM eq ""); + say OUT "$linesToSkip" if ($linesToSkip); + say OUT "$thisLine"; + ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount) = (); + ($thisVerseForOutput, $flag, $workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $orderedOutputLines, $linesWithRelevantSNs) = (); + ($linesToSkip) = (""); + } + elsif ($thisLine =~ /\t$thisLine"; + $originalLinesCount ++; + if ($thisLine =~ />([^\n<>]*)([^<]*)(<\/w>)/$1 text="$2">$3/; + if ($thisLine =~ /lemma="(\d+)"/) { + my $thisLemma = $language . $1; + if (exists $entriesThisSN{$thisLemma}) { + $rsnCount ++; + $linesWithRelevantSNs .= $thisLine . "\n"; + say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs"; + } + else { + $skipCount ++; + $thisLine =~ s/><\/w>/>√<\/w>/; + $linesToSkip .= "$thisLine\n"; + #say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip"; + } + } + } + elsif ($thisLine =~ //) { + say LOG "<0.3>\t$thisLine"; + ($bk, $ch, $vs) = ($1,$2,$3); + ($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", ""); + $staticText = $ULBtextThisVerse{$thisVerse}; + $residueText = "q $staticText q"; + $thisPreText = $ULBpreTextThisVerse{$thisVerse}; + say OUT "\t\t\t\t"; + ($flag) = (1); + } + else {say OUT $thisLine} + } + + close $thisFile; + close OUT; + } +} +sub ProcessRelevantSNs { + my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_); + my ($matchedLines, $thisLine); + my @relevantLines = split /\n/, $relevantLines; + foreach my $line (@relevantLines) { + if ($line =~ /lemma="(\d+)"/) { + my $thisSN = $language . $1; + say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}"; + ($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched); + $thisLine =~ s/[ \t]+$//; + $matchedLines .= $thisLine . "\n"; + $matchedLines =~ s/\n{2,}$/\n/s; + say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++" + } + } + return ($matchedLines, $residueText, $linesNotMatched); +} +sub MatchAndPlace { + my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_; + #say LOG "<8>\t\$line: $line \$sn: $sn \$workText\n$workText"; + my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen); + my @entries = split /, /, $entriesThisSN{$sn}; + foreach my $entry (@entries) { + my $entryType; + if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) { + ($first, $second, $third) = ($1, $2, $3); + ($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third); + $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third; + say LOG "<1a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$thirdLen: $thirdLen \$entry: |$entry|\t\$workEntry: |$workEntry|"; + $entryType = 1; + } + elsif ($entry =~ /^(.*) \.\.\. (.*)$/) { + ($first, $second) = ($1, $2); + ($firstLen, $secondLen) = (length $first, length $second); + $workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b"; + say LOG "<2a>\t\$first: $first, \$second: $second, \$third: $third, \$firstLen: $firstLen, \$secondLen,: $secondLen, \$entry: |$entry|\t\$workEntry: |$workEntry|"; + $entryType = 2; + } + else {$workEntry = $entry;} + + my $foundText; + #say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; + if ($workText =~ /\b$workEntry\b/p) { + say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry"; + ($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}"); + my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, ""); + $line =~ s/>$entry\n\$workText,: $workText, \$matchedLines:\n$matchedLines "; + + ($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen); + + say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines"; + + } + + else {$workText =~ s/ı/$replacementSpaces/;} + + $matchedLines .= "$line◊$place"; + + say LOG "<8.4>\tAfter found, new \$workText:\n$workText"; + $found = 1; + } + else { + #say LOG "\$workEntry $workEntry not found" + } + if ($found) { + last + } + } + unless ($found) { + $line =~ s/>?\$text:\n$text\n\t\t\$entry: $entry \$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace"; + if ($third) { + if ($foundText =~ /$first(.*)$second(.*)$third/) { + my ($firstGap, $secondGap) = ($1, $2); + my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace"; + say LOG "<9.1> \$repText: $repText"; + $text =~ s/ı/$repText/; + } + } + else { + if ($foundText =~ /$first(.*)$second/) { + my ($firstGap) = ($1); + say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|"; + my $repText ="$firstSpace$firstGap$secondSpace"; + say LOG "<9.3> \$repText: |$repText|"; + $text =~ s/ı/$repText/; + } + } + return ($text) +} +sub Prepare_ULB_file { + + my $thisVerse; + #$ULBText = read_file($ULBfile, binmode => 'utf8'); + + #while ($ULBText =~ /\n(.*?)<\/preText>\n.*(.*?)<\/text>.*<\/verse>/sg) { + # ($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2); + #} + # + + open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!"; + + while (my $thisLine = <$file>) { + chomp $thisLine; + if ($thisLine =~ /verse name="(.*?)"/) { + $thisVerse = $1; + #say LOG "$thisVerse:\n$thisLine" + } + elsif ($thisLine =~ /(.*?)<\/preText>/) { + $ULBpreTextThisVerse{$thisVerse} = $1; + #say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}" + } + elsif ($thisLine =~ /(.*?)<\/text>/) { + $ULBtextThisVerse{$thisVerse} = $1; + #say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}" + } + } + + close $file; +} + +__DATA__ +01 gen Genesis +02 exo Exodus +03 lev Leviticus +04 num Numbers +05 deu Deuteronomy +06 jos Joshua +07 jdg Judges +08 rut Ruth +09 1sa 1 Samuel +10 2sa 2 Samuel +11 1ki 1 Kings +12 2ki 2 Kings +13 1ch 1 Chronicles +14 2ch 2 Chronicles +15 ezr Ezra +16 neh Nehemiah +17 est Esther +18 job Job +19 psa Psalms +20 pro Proverbs +21 ecc Ecclesiastes +22 sng Song of Solomon +23 isa Isaiah +24 jer Jeremiah +25 lam Lamentations +26 ezk Ezekiel +27 dan Daniel +28 hos Hosea +29 jol Joel +30 amo Amos +31 oba Obadiah +32 jon Jonah +33 mic Micah +34 nam Nahum +35 hab Habakkuk +36 zep Zephaniah +37 hag Haggai +38 zec Zechariah +39 mal Malachi +41 mat Matthew +42 mrk Mark +43 luk Luke +44 jhn John +45 act Acts +46 rom Romans +47 1co 1 Corinthians +48 2co 2 Corinthians +49 gal Galatians +50 eph Ephesians +51 php Philippians +52 col Colossians +53 1th 1 Thessalonians +54 2th 2 Thessalonians +55 1ti 1 Timothy +56 2ti 2 Timothy +57 tit Titus +58 phm Philemon +59 heb Hebrews +60 jas James +61 1pe 1 Peter +62 2pe 2 Peter +63 1jn 1 John +64 2jn 2 John +65 3jn 3 John +66 jud Jude +67 rev Revelation diff --git a/Find_first_occurrence_of_proper_nouns.pl b/Find_first_occurrence_of_proper_nouns.pl new file mode 100644 index 00000000..0b26854d --- /dev/null +++ b/Find_first_occurrence_of_proper_nouns.pl @@ -0,0 +1,59 @@ +use 5.12.0; +use File::Slurp; +use File::Find ; +use Cwd ; + +my %location; + +open LOG, ">log/log.log" or die; + +open(IN, "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt") or die "$!"; + +say "Reading ULB"; + +while () { + #print LOG "$_"; + chomp; + while (s/^([^\n\t]*)\t([^\n]*?)([A-Z][a-z]+(-[A-Z][a-z]+)?)/$1\t$2/) { + # say LOG $3; + unless (exists $location{$3}) {$location{$3} = $1} + } + +} + +close IN; + +say "Outputting hash"; + +open(OUT, ">out/results.txt") or die "$!"; + +foreach my $word (sort keys %location) { + say OUT "$word, $location{$word}"; +} + +close OUT; + +say "Deleting common words"; + +my $fileText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8'); + +foreach my $word (sort keys %location) { + my $temp = lc $word; + #say LOG $word . "\t" . $temp; + if ($fileText =~ /\b$temp\b/) { + delete $location{$word} + } +} + +say "Outputting final product"; + +open(OUT, ">out/results.txt") or die "$!"; + +foreach my $word (sort keys %location) { + say OUT "$word, $location{$word}"; +} + +close OUT; + + +close LOG; diff --git a/Find_first_occurrence_of_tWs.pl b/Find_first_occurrence_of_tWs.pl new file mode 100644 index 00000000..e922961d --- /dev/null +++ b/Find_first_occurrence_of_tWs.pl @@ -0,0 +1,365 @@ +use 5.12.0; +use File::Slurp; +use File::Find ; +use Cwd ; + +my %value; + +open LOG, ">log/log.log" or die; + +while () { + chomp; + if (/^(.*)$/) { + $value{$1} = $1 + } +} + +my $fileText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8'); + +$fileText = "\n" . $fileText; + +foreach my $thisWord (sort keys %value) { + print $thisWord . "\n"; + if ($fileText =~ /\n([^\n\t]*)\t[^\n]*$thisWord\b/) { + say LOG $thisWord . ", " . $1; + } + +} + +close LOG; + +__DATA__ +Aaron +Abel +Abiathar +Abijah +Abimelek +Abner +Abraham +Absalom +Adam +Adonijah +Ahab +Ahaz +Ahaziah +Ahijah +Ai +Amalek +Amaziah +Ammon +Amnon +Amorite +Amos +Amoz +Andrew +Annas +Antioch +Apollos +Aquila +Arabah +Arabia +Aram +Ararat +Artaxerxes +Asa +Asaph +Ashdod +Asher +Asherah +Ashkelon +Asia +Assyria +Athaliah +Azariah +Baal +Baasha +Babel +Babylon +Balaam +Barabbas +Barnabas +Bartholomew +Baruch +Bashan +Bathsheba +Beelzebul +Beersheba +Benaiah +Benjamin +Berea +Beth Shemesh +Bethany +Bethel +Bethlehem +Bethuel +Boaz +Caesar +Caesarea +Caiaphas +Cain +Caleb +Cana +Canaan +Capernaum +Carmel +Chaldea +Cilicia +Colossae +Corinth +Cornelius +Crete +Cush +Cyprus +Cyrene +Cyrus +Damascus +Dan +Daniel +Darius +David +Delilah +Eden +Edom +Egypt +Ekron +Elam +Eleazar +Eliakim +Elijah +Elisha +Elizabeth +En Gedi +Enoch +Ephesus +Ephraim +Ephrath +Esau +Esther +Ethiopia +Euphrates River +Eve +Ezekiel +Ezra +Gabriel +Gad +Galatia +Galilee +Gath +Gaza +Gerar +Geshur +Gethsemane +Gibeah +Gibeon +Gideon +Gilead +Gilgal +Girgashites +Golgotha +Goliath +Gomorrah +Goshen +Greece +Greek +Habakkuk +Hagar +Haggai +Ham +Hamath +Hamor +Hananiah +Hannah +Haran +Hebron +Hermon +Herod +Herodias +Hezekiah +Hilkiah +Hittite +Hivite +Horeb +Hosea +Hoshea +Iconium +Isaac +Isaiah +Ishmael +Issachar +Jacob +James +Japheth +Jebus +Jehoiachin +Jehoiada +Jehoiakim +Jehoram +Jehoshaphat +Jehu +Jephthah +Jeremiah +Jericho +Jeroboam +Jerusalem +Jesse +Jethro +Jezebel +Jezreel +Joab +Joash +Job +Joel +John +John Mark +Jonah +Jonathan +Joppa +Joram +Jordan River +Joseph +Joshua +Josiah +Jotham +Judah +Judas.*Iscariot +Judas son of James +Judea +Kadesh +Kedar +Kedesh +Kerethites +Kidron Valley +Korah +Laban +Lamech +Lazarus +Leah +Lebanon +Levi +Leviathan +Lot +Luke +Lystra +Maakah +Macedonia +Maker +Malachi +Manasseh +Martha +Mary +Mary +Mary.*Magdalene +Matthew +Mede +Melchizedek +Memphis +Meshech +Mesopotamia +Micah +Michael +Midian +Miriam +Mishael +Mizpah +Moab +Molech +Mordecai +Moses +Mount of Olives +Naaman +Nahor +Nahum +Naphtali +Nathan +Nazareth +Nebuchadnezzar +Negev +Nehemiah +Nile River +Nineveh +Noah +Obadiah +Omri +Paddan Aram +Paran +Paul +Peor +Perizzite +Persia +Peter +Pharaoh +Philip +Philippi +Philistia +Philistines +Phinehas +Phoenicia +Pilate +Pontus +Potiphar +Priscilla +Rabbah +Rachel +Rahab +Ramah +Ramoth +Rebekah +Rehoboam +Reuben +Rimmon +Rome +Ruth +Salt Sea +Samaria +Samson +Samuel +Sarah +Saul +Sea of Galilee +Sea of Reeds +Sennacherib +Seth +Sharon +Sheba +Shechem +Shem +Shiloh +Shimei +Shinar +Sidon +Silas +Simeon +Simon the Zealot +Sinai +Sodom +Solomon +Stephen +Sukkoth +Syria +Tamar +Tarshish +Tarsus +Terah +Thessalonica +Thomas +Timothy +Tirzah +Titus +Troas +Tubal +Tychicus +Tyre +Ur +Uriah +Uzziah +Vashti +Xerxes +Zacchaeus +Zadok +Zebedee +Zebulun +Zechariah +Zedekiah +Zephaniah +Zerubbabel +Zoar diff --git a/Get_Strong_variants.pl b/Get_Strong_variants.pl new file mode 100644 index 00000000..6dd04a4e --- /dev/null +++ b/Get_Strong_variants.pl @@ -0,0 +1,24 @@ +$pre = "https://biblehub.com/greek/"; +$var = "a, b, c, d, e, f, g, h"; +$post = ".htm"; +@array = split (/, /, $var); + +#$out = system "curl --fail https://biblehub.com/greek/2.htm"; +#print "\n\n\t\$out: $out"; + +open OUT, ">out/results.txt"; + +foreach $xx (1611 .. 1613) { + foreach $var (@array) { + $string = $pre . $xx . $var . $post; + $out = `curl $string`; + if ($out =~ /We're sorry, we were not able to find that passage./) { + last + } else { + print OUT "\$xx: $xx. \$string: $string."; + } + } + +} + +close OUT; \ No newline at end of file diff --git a/Inventory_usfm_markers.pl b/Inventory_usfm_markers.pl new file mode 100644 index 00000000..b46d50e8 --- /dev/null +++ b/Inventory_usfm_markers.pl @@ -0,0 +1,38 @@ +use 5.18.0; +use File::Slurp; +use File::Find ; +use Cwd ; + +my $topDir = "/Users/Henry/Documents/WACS/en_ulb"; + +my %found; +my @filesToRun = (); +my $filePattern = '*.usfm' ; + + +open LOG, ">/Users/Henry/Google Drive/WA/Scripts/out/log.log" or die; +open OUT, ">/Users/Henry/Google Drive/WA/Scripts/out/output.txt" or die; + +find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; + +foreach my $file ( @filesToRun ) +{ + print "$file\n" ; + my $fileText = read_file("$file", binmode => 'utf8'); + $fileText =~ s/\n/ /g; + while ($fileText =~ /(\\[^ ]*) /g) { + my $code = $1; + unless (exists $found{$code}) { + $found{$code} = $code + } + } +} + +foreach my $code (sort keys %found) { + say OUT $code +} + +close OUT; +close LOG; + +print "\n\tDone."