# Forked from WycliffeAssociates/en_ulb_tagged (Perl, 181 lines, 4.6 KiB as published).
|
# Builds an easily searchable text file from the current OGNT and MAST-HB XML files.
# Processes one verse at a time from each slurped file.
# Useful for the Mine routine that builds the MAST PDF.
|
||
|
use 5.18.0;
|
||
|
use File::Slurp;
|
||
|
use File::Find ;
|
||
|
use Cwd ;
|
||
|
use utf8;
|
||
|
#use open IN => ":utf8", OUT => ":utf8";
|
||
|
use open IO => ":utf8";
|
||
|
# Trace log and result file. Both are opened for UTF-8 output relative to the
# current working directory; the script stops with the OS error if either
# cannot be created (for example when Logs/ or Output/ does not exist).
open LOG, ">:utf8", "Logs/log.txt"
    or die "Cannot open Logs/log.txt for writing: $!";
open OUT, ">:utf8", "Output/Original_languages.txt"
    or die "Cannot open Output/Original_languages.txt for writing: $!";

# Directory trees that are searched for XML files, in processing order.
my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT");

# Book lookup tables filled from the __DATA__ section:
#   %order : first column  => long book name
#   %long  : second column => long book name
my (%order, %long);

# Accumulates everything that is finally written to OUT.
my $outText;
|
||
|
|
||
|
# Load the book table from the __DATA__ section. Each usable row has three
# tab-separated columns: number, abbreviation, long name. Rows with fewer than
# two tabs are ignored.
while (my $row = <DATA>) {
    chomp $row;

    # Limit of 3 keeps any further tabs inside the long-name column.
    my ($number, $abbrev, $name) = split /\t/, $row, 3;
    next unless defined $name;

    $order{$number} = $name;
    $long{$abbrev}  = $name;
}
|
||
|
#foreach my $key (sort keys %long) {
|
||
|
# say LOG $key . "\t" . $long{$key};
|
||
|
#}
|
||
|
|
||
|
# Walk every source folder, read each XML file found below it and process its
# <verse> elements one at a time.
#
# Reads : @folders, %long (book abbreviation => long name), GetContent()
# Writes: $outText (accumulated result text) and LOG (trace of each decision)
foreach my $folder (@folders) {
    say LOG "$folder";
    #system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";

    # "G" when the folder path contains OGNT, "H" otherwise. Only the disabled
    # tagging step at the end of the file loop refers to it.
    my $lang = $folder =~ /OGNT/ ? "G" : "H";

    # Collect every regular file whose name ends in ".xml". File::Find sets $_
    # to the bare file name and chdirs into the directory, so -f $_ is valid.
    my @filesToRun;
    find(sub { push @filesToRun, $File::Find::name if -f $_ && /\.xml\z/ }, $folder);
    @filesToRun = sort @filesToRun;

    foreach my $file (@filesToRun) {
        say LOG $file;
        my $fileText = read_file("$file", binmode => 'utf8');

        # Text held back from one verse to be emitted with a later one.
        my $oldHold = "";

        # Captures: $1 = full osisID, $2 = book, $3 = chapter, $4 = verse.
        while ($fileText =~ /<verse osisID="((.*?)\.(\d+)\.(\d+))".*?<\/verse>/spg) {
            my $verse = ${^MATCH};
            say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4";
            my ($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4);
            my $current = "$bk $ch:$vs";
            say LOG "<0>\t\$current: $current";
            my $verseText;

            if ($verse =~ /<note>KJV:(([^.]*)\.([^.]*)\.([^<]*))<\/note>/p) { # Occurs only in OT
                # Remember the pieces around the note now; later matches would
                # overwrite the match variables.
                my ($before, $note, $after) = (${^PREMATCH}, ${^MATCH}, ${^POSTMATCH});
                say LOG "<1>\t$note";
                my ($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4);
                my $interruption = "$nbk $nch:$nvs";
                say LOG "<2>\t\$interruption: $interruption (of $current)";

                # \Q..\E: the IDs contain dots (and possibly other regex
                # metacharacters) that must be matched literally.
                if ($verse =~ /<verse osisID="\Q$shortCur\E">\n[^<\n]*<note>KJV:\Q$shortIntr\E<\/note>/p) { # Complete renumber of verse
                    say LOG "<3>\t${^MATCH}";
                    $current   = $interruption;
                    $verseText = "$current\t$oldHold" . GetContent($verse);
                    $oldHold   = "";
                }
                elsif ($interruption ne $current) { # New verse begins here
                    say LOG "<4>\t$note";
                    my $thisVsText = GetContent($before);
                    my $nextVsText = GetContent($after);
                    $outText .= "$oldHold\n$current\t$thisVsText ";
                    $oldHold = "$nextVsText ";
                }
                else { # Previous verse continues here
                    say LOG "<5>\t$note";
                    # NOTE(review): the text before the note is not used here.
                    my $thisVsText = GetContent($after);
                    $verseText .= "$oldHold\n$current\t$thisVsText";
                    $oldHold = "";
                }
            }
            else {
                # The whole verse should be processed in one piece
                # NOTE(review): this step is disabled, so verses without a KJV
                # note currently contribute nothing to the output.
                #$verseText = GetContent($verse);
                #$verseText = "$current\t$verseText"
            }
            # NOTE(review): $verseText built in branches <3> and <5> is
            # discarded while the following lines stay disabled.
            #$thisBookText .= "\n$verseText";
            #$oldHold = $holdText
        }
        #$thisBookText =~ s/</<$lang/g;
        #$outText .= "$thisBookText\n";
    }
}
|
||
|
|
||
|
# Write the accumulated text and finish. Buffered write errors only surface
# when the handle is closed, so the result of close is checked: a failure on
# the result file is fatal, a failure on the log is only reported.
say OUT $outText;

close OUT or die "Cannot close Output/Original_languages.txt: $!";
close LOG or warn "Cannot close Logs/log.txt: $!";

print "\n\tDone.";
|
||
|
|
||
|
# GetContent($text)
#
# Extracts every <w lemma="...">word</w> element from $text, in order, and
# returns them as one string of "word <number> " items (note the trailing
# space after each item). The number is the lemma attribute with the
# non-digit text around its first run of digits removed.
#
# Only elements whose first attribute is lemma and whose content contains no
# nested tag are recognised. Returns "" when nothing matches.
sub GetContent {
    my ($text, $returnText) = ($_[0], "");

    # /g is essential: without it the match restarts at the beginning of
    # $text on every iteration and the loop never terminates.
    while ($text =~ /<w lemma="([^"]*)"[^>]*>([^<]*)<\/w>/g) {
        my ($lemma, $OL) = ($1, $2);

        # Reduce e.g. "b/7225" or "1254 a" to the bare number.
        $lemma =~ s/[^\d"]*(\d+)[^\d"]*/$1/;

        $returnText .= "$OL <$lemma> ";
    }

    return $returnText;
}
|
||
|
|
||
|
__DATA__
|
||
|
01 gen Genesis
|
||
|
02 exo Exodus
|
||
|
03 lev Leviticus
|
||
|
04 num Numbers
|
||
|
05 deu Deuteronomy
|
||
|
06 jos Joshua
|
||
|
07 jdg Judges
|
||
|
08 rut Ruth
|
||
|
09 1sa 1 Samuel
|
||
|
10 2sa 2 Samuel
|
||
|
11 1ki 1 Kings
|
||
|
12 2ki 2 Kings
|
||
|
13 1ch 1 Chronicles
|
||
|
14 2ch 2 Chronicles
|
||
|
15 ezr Ezra
|
||
|
16 neh Nehemiah
|
||
|
17 est Esther
|
||
|
18 job Job
|
||
|
19 psa Psalms
|
||
|
20 pro Proverbs
|
||
|
21 ecc Ecclesiastes
|
||
|
22 sng Song of Solomon
|
||
|
23 isa Isaiah
|
||
|
24 jer Jeremiah
|
||
|
25 lam Lamentations
|
||
|
26 ezk Ezekiel
|
||
|
27 dan Daniel
|
||
|
28 hos Hosea
|
||
|
29 jol Joel
|
||
|
30 amo Amos
|
||
|
31 oba Obadiah
|
||
|
32 jon Jonah
|
||
|
33 mic Micah
|
||
|
34 nam Nahum
|
||
|
35 hab Habakkuk
|
||
|
36 zep Zephaniah
|
||
|
37 hag Haggai
|
||
|
38 zec Zechariah
|
||
|
39 mal Malachi
|
||
|
41 mat Matthew
|
||
|
42 mrk Mark
|
||
|
43 luk Luke
|
||
|
44 jhn John
|
||
|
45 act Acts
|
||
|
46 rom Romans
|
||
|
47 1co 1 Corinthians
|
||
|
48 2co 2 Corinthians
|
||
|
49 gal Galatians
|
||
|
50 eph Ephesians
|
||
|
51 php Philippians
|
||
|
52 col Colossians
|
||
|
53 1th 1 Thessalonians
|
||
|
54 2th 2 Thessalonians
|
||
|
55 1ti 1 Timothy
|
||
|
56 2ti 2 Timothy
|
||
|
57 tit Titus
|
||
|
58 phm Philemon
|
||
|
59 heb Hebrews
|
||
|
60 jas James
|
||
|
61 1pe 1 Peter
|
||
|
62 2pe 2 Peter
|
||
|
63 1jn 1 John
|
||
|
64 2jn 2 John
|
||
|
65 3jn 3 John
|
||
|
66 jud Jude
|
||
|
67 rev Revelation
|