# en_ulb_tagged/Build_OL_files_from_XML.pl

# Builds an easily searchable file from the current OGNT and MAST-HB XML files.
# Takes one verse at a time from each slurped file.
# Useful for the Mine routine that builds the MAST PDF.
use 5.18.0;
use File::Slurp;
use File::Find ; 
use Cwd ; 
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
# Open the log and the output file up front, so that a missing Logs/ or
# Output/ directory is reported immediately with the path and the OS error.
open LOG, ">:utf8", "Logs/log.txt" or die "Cannot open Logs/log.txt for writing: $!";
open OUT, ">:utf8", "Output/Original_languages.txt" or die "Cannot open Output/Original_languages.txt for writing: $!";

# Source trees that are searched for XML files.
my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT");
# Book lookup tables filled from the __DATA__ section:
#   %order : first column (book number, e.g. "01")  -> third column (book name)
#   %long  : second column (book code, e.g. "gen") -> third column (book name)
my (%order, %long);
# Accumulates the text that is written to OUT at the end of the run.
my $outText;

# Load the tab-separated book table.  Only rows that start with a numeric
# book number and have a non-empty code and name are accepted, so any stray
# non-table text in the DATA section cannot add junk or empty keys.
while (my $row = <DATA>) {
	chomp $row;
	next unless $row =~ /^(\d+)\t([^\t]+)\t(.+)$/;
	my ($number, $code, $name) = ($1, $2, $3);
	$order{$number} = $name;
	$long{$code} = $name;
}
#foreach my $key (sort keys %long) {
#	say LOG $key . "\t" . $long{$key};
#}

# Main pass: for each source folder, collect its XML files and walk every
# <verse> element of each file.  Verses that carry a <note>KJV:...</note>
# marker are split or renumbered according to that marker; verses without
# one are currently skipped (their handling is commented out below).
foreach my $folder (@folders) {
	say LOG "$folder";
	#system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";
	# $lang is "G" for the OGNT tree and "H" otherwise; it is referenced only
	# by the commented-out tagging step at the end of the per-file loop.
	my ($topDir, $lang) = ($folder, "H");

	if ($folder =~ /OGNT/) {$lang = "G"}

	my @filesToRun = ();
	# Select plain files whose name really ends in ".xml".  Interpolating the
	# shell glob '*.xml' into a regex treats '*' and '.' as metacharacters,
	# which also accepts names such as "fooxml" and matching directories.
	find( sub { push @filesToRun, $File::Find::name if ( -f $_ && m/\.xml\z/ ) }, $topDir) ;
	@filesToRun = sort @filesToRun;
	foreach my $file ( @filesToRun ) {
		say LOG $file;
		my $fileText = read_file("$file", binmode => 'utf8');
		# Per-file state.  $oldHold carries text held back from one verse to
		# the next; $thisBookText and $holdText are used only by the
		# commented-out code below.
		my ($bk, $ch, $vs, $nbk, $nch, $nvs, $previous, $current, $interruption, $verse, $thisBookText, $prevVsText, $holdText, $thisVsText, $nextVsText, $oldHold, $shortCur, $shortIntr);
		# One iteration per <verse osisID="Book.chapter.verse">...</verse> element.
		while ($fileText =~ /<verse osisID="((.*?)\.(\d+)\.(\d+))"(\n|.)*?<\/verse>/spg) {
			$verse = $&;
			say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4";
			# NOTE(review): %long is keyed by the codes listed in __DATA__;
			# confirm the osisID book part uses the same spelling, otherwise
			# $bk is undef here.
			($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4);
			$previous = $current;
			$current = "$bk $ch:$vs";
			say LOG "<0>\t\$current: $current";
			my $verseText;

			if ($verse =~ /<note>KJV:(([^\.]*)\.([^\.]*).([^<]*))<\/note>/p) { # Occurs only in OT
				say LOG "<1>\t$&";
				($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4);
				# Reference named by the KJV note, in the same "Book c:v" form as $current.
				$interruption = "$nbk $nch:$nvs";
				say LOG "<2>\t\$interruption: $interruption (of $current)";
				# \Q..\E: the references are literal text, not patterns (they contain dots).
				if ($verse =~ /<verse osisID="\Q$shortCur\E">\n[^<\n]*<note>KJV:\Q$shortIntr\E<\/note>/) { # Complete renumber of verse
					# The note is the first element inside the verse, so the
					# whole verse takes the reference from the note.
					say LOG "<3>\t$&";
					$current = $interruption;
					$verseText = GetContent($verse);
					$verseText = "$current\t$oldHold$verseText";
					$oldHold = "";
					# NOTE(review): $verseText is not appended to $outText
					# anywhere (the append after this if/else is commented out).
				}
				elsif ($interruption ne $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # New verse begins here
					say LOG "<4>\t$&";
					# Split at the note: text before it is emitted under $current,
					# text after it is held in $oldHold for a later verse.
					($thisVsText, $nextVsText) = (${^PREMATCH}, ${^POSTMATCH});
					$thisVsText = GetContent($thisVsText);
					$nextVsText = GetContent($nextVsText);
					$outText .= "$oldHold\n$current\t$thisVsText ";
					$oldHold = "$nextVsText ";
				}
				elsif ($interruption eq $current && $verse =~ /<note>KJV:([^\.]*)\.([^\.]*).([^<]*)<\/note>/p) { # Previous verse continues here
					say LOG "<5>\t$&";
					# Split at the note: the part before it is extracted into
					# $prevVsText (not used further), the part after it goes
					# under $current.
					($prevVsText, $thisVsText) = (${^PREMATCH}, ${^POSTMATCH});
					$prevVsText = GetContent($prevVsText);
					$thisVsText = GetContent($thisVsText);
					$verseText .= "$oldHold\n$current\t$thisVsText";
					$oldHold = "";
					# NOTE(review): as in branch <3>, $verseText is never written out.
				}
			}
			else {
				# The whole verse should be processed in one piece
				#$verseText = GetContent($verse);
				#$verseText = "$current\$tverseText"
			}
			#$thisBookText .= "\n$verseText";
			#$oldHold = $holdText
		}
		#$thisBookText =~ s/</<$lang/g;
		#$outText .= "$thisBookText\n";
	}


}

# Write everything accumulated in $outText in one go.  It is still undef when
# no verse produced output, so fall back to the empty string.
$outText //= "";
say OUT $outText;

# Buffered write errors only surface at close, so check both handles: a
# failure on the output file is fatal, a failure on the log is only reported.
close OUT or die "Error closing Output/Original_languages.txt: $!";
close LOG or warn "Error closing Logs/log.txt: $!";

print "\n\tDone.";

# GetContent($xml)
#
# Collects every <w lemma="...">word</w> element found in $xml, in document
# order, and returns them as one string of "word <lemma> " entries (each
# entry followed by a space).  For a lemma such as "H7225" the non-digit text
# around the first number is stripped, giving "7225"; a lemma without digits
# is kept unchanged.  Returns the empty string when $xml has no <w> element.
sub GetContent {
	my ($text, $returnText) = ($_[0], "");
	# /g is essential here: in scalar context it resumes after the previous
	# match on each pass.  Without it the same first <w> element matches
	# again and again and the loop never terminates.
	while ($text =~ /<w lemma="([^"]*)"[^>]*>([^<]*)<\/w>/g) {
		my ($lemma, $OL) = ($1, $2);
		# Reduce the lemma to its first run of digits (single substitution,
		# so only the first number and the text touching it are affected).
		$lemma =~ s/[^\d"]*(\d+)[^\d"]*/$1/;
		$returnText .= "$OL <$lemma> ";
	}
	return $returnText;
}

__DATA__
01	gen	Genesis
02	exo	Exodus
03	lev	Leviticus
04	num	Numbers
05	deu	Deuteronomy
06	jos	Joshua
07	jdg	Judges
08	rut	Ruth
09	1sa	1 Samuel
10	2sa	2 Samuel
11	1ki	1 Kings
12	2ki	2 Kings
13	1ch	1 Chronicles
14	2ch	2 Chronicles
15	ezr	Ezra
16	neh	Nehemiah
17	est	Esther
18	job	Job
19	psa	Psalms
20	pro	Proverbs
21	ecc	Ecclesiastes
22	sng	Song of Solomon
23	isa	Isaiah
24	jer	Jeremiah
25	lam	Lamentations
26	ezk	Ezekiel
27	dan	Daniel
28	hos	Hosea
29	jol	Joel
30	amo	Amos
31	oba	Obadiah
32	jon	Jonah
33	mic	Micah
34	nam	Nahum
35	hab	Habakkuk
36	zep	Zephaniah
37	hag	Haggai
38	zec	Zechariah
39	mal	Malachi
41	mat	Matthew
42	mrk	Mark
43	luk	Luke
44	jhn	John
45	act	Acts
46	rom	Romans
47	1co	1 Corinthians
48	2co	2 Corinthians
49	gal	Galatians
50	eph	Ephesians
51	php	Philippians
52	col	Colossians
53	1th	1 Thessalonians
54	2th	2 Thessalonians
55	1ti	1 Timothy
56	2ti	2 Timothy
57	tit	Titus
58	phm	Philemon
59	heb	Hebrews
60	jas	James
61	1pe	1 Peter
62	2pe	2 Peter
63	1jn	1 John
64	2jn	2 John
65	3jn	3 John
66	jud	Jude
67	rev	Revelation
Now needed for tagging 2020-07-28 21:34:00 +00:00			`# Builds easily searchable file from current OGNT and MAST-HB XML file`
			`# Takes verse at a time from slurped file`
			`# Useful for Mine routine building MAST PDF`
			`use 5.18.0;`
			`use File::Slurp;`
			`use File::Find ;`
			`use Cwd ;`
			`use utf8;`
			`#use open IN => ":utf8", OUT => ":utf8";`
			`use open IO => ":utf8";`
			`open LOG, ">:utf8", "Logs/log.txt" or die;`
			`open OUT, ">:utf8", "Output/Original_languages.txt" or die;`

			`my (@folders) = ("/Users/Henry/Documents/WACS/MAST_HB", "/Users/Henry/Documents/WACS/OGNT");`
			`my (%order, %long);`
			`my $outText;`

			`while (<DATA>) {`
			`chomp;`
			`if (/^([^\t])\t([^\t])\t(.*)$/) {`
			`$order{$1} = $3;`
			`$long{$2} = $3;`
			`}`
			`}`
			`#foreach my $key (sort keys %long) {`
			`# say LOG $key . "\t" . $long{$key};`
			`#}`

			`foreach my $folder (@folders) {`
			`say LOG "$folder";`
			`#system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";`
			`my ($topDir, $lang) = ($folder, "H");`

			`if ($folder =~ /OGNT/) {$lang = "G"}`

			`my @filesToRun = ();`
			`my $filePattern = '*.xml' ;`
			`find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;`
			`@filesToRun = sort @filesToRun;`
			`foreach my $file ( @filesToRun ) {`
			`say LOG $file;`
			`my $fileText = read_file("$file", binmode => 'utf8');`
			`my ($bk, $ch, $vs, $lemma, $word, $nbk, $nch, $nvs, $previous, $current, $interruption, $next, $verse, $thisBookText, $prevVsText, $holdText, $thisVsText, $nextVsText, $oldHold, $shortCur, $shortIntr);`
			`while ($fileText =~ /<verse osisID="((.?)\.(\d+)\.(\d+))"(\n\|.)?<\/verse>/spg) {`
			`$verse = $&;`
			`say LOG "\$1: $1, \$2: $2, \$3: $3, \$4: $4";`
			`($shortCur, $bk, $ch, $vs) = ($1, $long{$2}, $3, $4);`
			`$previous = $current;`
			`$current = "$bk $ch:$vs";`
			`say LOG "<0>\t\$current: $current";`
			`my $verseText;`

			`if ($verse =~ /<note>KJV:(([^\.])\.([^\.]).([^<]*))<\/note>/p) { # Occurs only in OT`
			`say LOG "<1>\t$&";`
			`($shortIntr, $nbk, $nch, $nvs) = ($1, $long{$2}, $3, $4);`
			`$interruption = "$nbk $nch:$nvs";`
			`say LOG "<2>\t\$interruption: $interruption (of $current)";`
			`if ($verse =~ /<verse osisID="$shortCur">\n[^<\n]*<note>KJV:$shortIntr<\/note>/) { # Complete renumber of verse`
			`say LOG "<3>\t$&";`
			`$current = $interruption;`
			`$verseText = GetContent($verse);`
			`$verseText = "$current\t$oldHold$verseText";`
			`$oldHold = "";`
			`}`
			`elsif ($interruption ne $current && $verse =~ /<note>KJV:([^\.])\.([^\.]).([^<]*)<\/note>/p) { # New verse begins here`
			`say LOG "<4>\t$&";`
			`($thisVsText, $nextVsText) = (${^PREMATCH}, ${^POSTMATCH});`
			`$thisVsText = GetContent($thisVsText);`
			`$nextVsText = GetContent($nextVsText);`
			`$outText .= "$oldHold\n$current\t$thisVsText ";`
			`$oldHold = "$nextVsText ";`
			`}`
			`elsif ($interruption eq $current && $verse =~ /<note>KJV:([^\.])\.([^\.]).([^<]*)<\/note>/p) { # Previous verse continues here`
			`say LOG "<5>\t$&";`
			`($prevVsText, $thisVsText) = (${^PREMATCH}, ${^POSTMATCH});`
			`$prevVsText = GetContent($prevVsText);`
			`$thisVsText = GetContent($thisVsText);`
			`$verseText .= "$oldHold\n$current\t$thisVsText";`
			`$oldHold = "";`
			`}`
			`}`
			`else {`
			`# The whole verse should be processed in one piece`
			`#$verseText = GetContent($verse);`
			`#$verseText = "$current\$tverseText"`
			`}`
			`#$thisBookText .= "\n$verseText";`
			`#$oldHold = $holdText`
			`}`
			`#$thisBookText =~ s/</<$lang/g;`
			`#$outText .= "$thisBookText\n";`
			`}`


			`}`

			`say OUT $outText;`

			`close OUT;`
			`close LOG;`

			`print "\n\tDone.";`

			`sub GetContent {`
			`my ($text, $returnText) = ($_[0], "");`
			`while ($text =~ /<w lemma="([^"])"[^>]>([^<]*)<\/w>/) {`
			`my ($lemma, $OL) = ($1, $2);`
			`$lemma =~ s/[^\d"](\d+)[^\d"]/$1/;`
			`$returnText .= "$OL <$lemma> "`
			`}`
			`return $returnText`
			`}`

			`__DATA__`
			`01 gen Genesis`
			`02 exo Exodus`
			`03 lev Leviticus`
			`04 num Numbers`
			`05 deu Deuteronomy`
			`06 jos Joshua`
			`07 jdg Judges`
			`08 rut Ruth`
			`09 1sa 1 Samuel`
			`10 2sa 2 Samuel`
			`11 1ki 1 Kings`
			`12 2ki 2 Kings`
			`13 1ch 1 Chronicles`
			`14 2ch 2 Chronicles`
			`15 ezr Ezra`
			`16 neh Nehemiah`
			`17 est Esther`
			`18 job Job`
			`19 psa Psalms`
			`20 pro Proverbs`
			`21 ecc Ecclesiastes`
			`22 sng Song of Solomon`
			`23 isa Isaiah`
			`24 jer Jeremiah`
			`25 lam Lamentations`
			`26 ezk Ezekiel`
			`27 dan Daniel`
			`28 hos Hosea`
			`29 jol Joel`
			`30 amo Amos`
			`31 oba Obadiah`
			`32 jon Jonah`
			`33 mic Micah`
			`34 nam Nahum`
			`35 hab Habakkuk`
			`36 zep Zephaniah`
			`37 hag Haggai`
			`38 zec Zechariah`
			`39 mal Malachi`
			`41 mat Matthew`
			`42 mrk Mark`
			`43 luk Luke`
			`44 jhn John`
			`45 act Acts`
			`46 rom Romans`
			`47 1co 1 Corinthians`
			`48 2co 2 Corinthians`
			`49 gal Galatians`
			`50 eph Ephesians`
			`51 php Philippians`
			`52 col Colossians`
			`53 1th 1 Thessalonians`
			`54 2th 2 Thessalonians`
			`55 1ti 1 Timothy`
			`56 2ti 2 Timothy`
			`57 tit Titus`
			`58 phm Philemon`
			`59 heb Hebrews`
			`60 jas James`
			`61 1pe 1 Peter`
			`62 2pe 2 Peter`
			`63 1jn 1 John`
			`64 2jn 2 John`
			`65 3jn 3 John`
			`66 jud Jude`
			`67 rev Revelation`