Work on MAST PDF

2020-07-24 16:50:00 -04:00 · 2020-07-24 16:50:00 -04:00 · 265e05de6d
parent e2b88b6805
commit 265e05de6d
7 changed files with 101 additions and 955 deletions
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Build_OL_files_from_XML.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Build_OL_files_from_XML.pl
@ -1,5 +1,7 @@
-# Builds easily searchable files from current OGNT and MAST-HB XML files
-use 5.12.0;
+# Builds easily searchable files from the current OGNT and MAST-HB XML files
+# Processes one verse at a time from the slurped file text
+use 5.18.0;
+use File::Slurp;
 use File::Find ; 
 use Cwd ; 
 use utf8;
@ -22,6 +24,8 @@ while (<DATA>) {

 foreach my $folder (@folders) {
 	say "$folder";
+	#system "cd $folder;xml val *.xml;echo 'Continue? (Control + C to quit, Enter to continue)';read name;";
+
 	my $topDir = $folder;

 	my @filesToRun = ();
@ -30,24 +34,24 @@ foreach my $folder (@folders) {
 	@filesToRun = sort @filesToRun;
 	foreach  my $file ( @filesToRun  ) {
 	   say $file;
-	   open (IN, $file) or die "$!";
-	   while (<IN>) {
-	   		my ($bk, $ch, $vs, $lemma, $word);
-	   		chomp;
-	   		if (/<verse osisID="(.*)\.(\d+)\.(\d+)">/) {
-	   			($bk, $ch, $vs) = ($long{$1}, $2, $3);
-	   			$outText .= "\n$bk $ch:$vs\t"
-	   		} elsif (/<w lemma="([^"]*)" morph=".*" lexeme=".*">(.*)<\/w>/ && not /<note type="variant">/) {
-	   			($lemma, $word) = ($1, $2);
-				$lemma =~ s/^[^\d]*(\d{1,4})[^\d]*$/G$1/;
-	   			$outText .= "$word <$lemma> "
-	   		} elsif (/<w lemma="([^"]*)" (n="[^"]*" )?morph="[^"]*" id="[^"]*">([^<]*)<\/w>/ && not /<note type="variant">/) {
-	   			($lemma, $word) = ($1, $3);
-				$lemma =~ s/^[^\d]*(\d{1,4})[^\d]*$/H$1/;
-				$word =~ s/\///g;
-	   			$outText .= "$word <$lemma> "
-	   		}
+	   my $fileText = read_file("$file", binmode => 'utf8');
+	   my ($bk, $ch, $vs, $lemma, $word, $nbk, $nch, $nvs, $previous, $current, $interruption, $verse);
+	   # [^"]* keeps the book capture inside the attribute value; under /s a bare .* would run on across later verses
+	   while ($fileText =~ /<verse osisID="([^"]*)\.(\d+)\.(\d+)".*?<\/verse>/spg) {
+			$verse = ${^MATCH};	# whole <verse>...</verse> element, made available by /p
+			($bk, $ch, $vs) = ($long{$1}, $2, $3);
+			$previous = $current;
+			$current = "$bk $ch:$vs";
+			# If the verse carries <note>KJV:book.ch.vs</note>, head the verse with that reference when it differs from the osisID
+			if ($verse =~ /<note>KJV:([^.<]*)\.(\d+)\.(\d+)<\/note>/) {
+				($nbk, $nch, $nvs) = ($long{$1}, $2, $3);
+				$interruption = "$nbk $nch:$nvs";
+				$current = $interruption if $interruption ne $current;
+			}
+			# Append exactly one reference header per verse, whether or not a KJV note was present
+			$outText .= "\n$current\t";
 	   }
+	   
 	} 


--- a/MAST_tW_PDF_Updater/FilesForUpdates/Buld_MAST_OGNT_from_csv.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Buld_MAST_OGNT_from_csv.pl
@ -34,7 +34,7 @@ while (<IN>) {
 	Separate();
 }

-say OUT "				</verse>\n			</chapter>\n		</div>\n	</book>\n</xml>";
+say OUT "			</verse>\n		</chapter>\n	</div>\n</xml>";

 say "Closing input and output files ...";

@ -54,20 +54,20 @@ sub Separate {
 			my ($this_bk) = ($bk{$bn});
 			$bklc = lc $bk{$bn};
 			if (OUT-> opened()) {
-				say OUT "				</verse>\n			</chapter>\n		</div>\n	</book>\n</xml>";
+				say OUT "			</verse>\n		</chapter>\n	</div>\n</xml>";	# closes the <div type="book"> written when this book's file was opened; no <book> element is emitted any more
 				close OUT;
 			}
 			open OUT, ">:utf8", "OGNT_for_tagging/$bn-$bk{$bn}.xml" or die "$! $bn-$bk{$bn}.xml";
-			say OUT "\n<xml>\n	<book>\n		<div type=\"book\" osisID=\"$bklc\">\n			<chapter osisID=\"$bklc.$ch\">\n				<verse osisID=\"$bklc.$ch.$vs\">";
+			say OUT "\n<xml>\n	<div type=\"book\" osisID=\"$bklc\">\n		<chapter osisID=\"$bklc.$ch\">\n			<verse osisID=\"$bklc.$ch.$vs\">";
 			($last_bn, $last_ch, $last_vs) = ($bn, $ch, $vs)
 		}
 		elsif ($ch ne $last_ch) {
-			say OUT "				</verse>\n			</chapter>\n			<chapter osisID=\"$bklc.$ch\">\n				<verse osisID=\"$bklc.$ch.$vs\">";
+			say OUT "			</verse>\n		</chapter>\n		<chapter osisID=\"$bklc.$ch\">\n			<verse osisID=\"$bklc.$ch.$vs\">";
 			($last_ch, $last_vs) = ($ch, $vs)
 		}
 		elsif ($vs ne $last_vs) {
 			my ($this_bk, $bklc) = ($bk{$bn}, lc $bk{$bn});
-			say OUT "				</verse>\n				<verse osisID=\"$bklc.$ch.$vs\">";
+			say OUT "			</verse>\n			<verse osisID=\"$bklc.$ch.$vs\">";
 			$last_vs = $vs;
 		}
 	say OUT "\t\t\t\t\t<w OGNTsort=\"$OGNTSort\" ULBorder=\"---\" lemma=\"G$sn\" morph=\"$gram\" lexeme=\"$lexeme\">$word</w>"
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.bad.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.bad.pl
@ -1,102 +0,0 @@
-#!/usr/bin/perl
-use warnings;
-use strict;
-
-use autodie;
-use File::Copy;
-
-my %filenames;
-
-while (<DATA>) {
-		chomp;
-	if (/([^\t]*)\t([^\t]*)/) {
-		#($oldName, $newName) = ($1, $2);
-		$filenames{$1} = $2;
-	}
-}
-
-
-# capture script name, in case we are running the script from the 
-# same directory we working on. 
-my $this_file = (split(/\//, $0))[-1];
-print "skipping file: $this_file\n";
-
-my $oldnames = "/home/henry/Documents/WA_Repo/OSHB";
-my $newnames = "/home/henry/Documents/WA_Repo/MAST_HB";
-
-# open the directory
-opendir(my $dh, $oldnames); 
-
-# grep out all directories and possibly this script. 
-my @files_to_rename = grep { !-d && $_ ne $this_file } readdir $dh;
-closedir $dh;
-
-### UPDATED ###
-# create hash of file names from lists:
-my @missing_new_file = ();  
-
-
-# change directory, so we don't have to worry about pathing
-# of files to rename and move... 
-chdir($oldnames);
-mkdir($newnames) if !-e $newnames;
-
-
-### UPDATED ###
-for my $file (@files_to_rename) {
-    # Check that current file exists in the hash,
-    # if true, copy old file to new location with new name
-    if( exists($filenames->{$file}) ) { 
-        copy($file, "$newnames/$filenames->{$file}");
-    } else {
-        push @missing_new_file, $file;
-    } 
-}
-
-
-if( @missing_new_file ) { 
-    print "Could not map files:\n", 
-        join("\n", @missing_new_file), "\n";
-}
-
-
-__DATA__
-Gen.xml	01-GEN.xml
-Exod.xml	02-EXO.xml
-Lev.xml	03-LEV.xml
-Num.xml	04-NUM.xml
-Deut.xml	05-DEU.xml
-Josh.xml	06-JOS.xml
-Judg.xml	07-JDG.xml
-Ruth.xml	08-RUT.xml
-1Sam.xml	09-1SA.xml
-2Sam.xml	10-2SA.xml
-1Kgs.xml	11-1KI.xml
-2Kgs.xml	12-2KI.xml
-1Chr.xml	13-1CH.xml
-2Chr.xml	14-2CH.xml
-Ezra.xml	15-EZR.xml
-Neh.xml	16-NEH.xml
-Esth.xml	17-EST.xml
-Job.xml	18-JOB.xml
-Ps.xml	19-PSA.xml
-Prov.xml	20-PRO.xml
-Eccl.xml	21-ECC.xml
-Song.xml	22-SNG.xml
-Isa.xml	23-ISA.xml
-Jer.xml	24-JER.xml
-Lam.xml	25-LAM.xml
-Ezek.xml	26-EZK.xml
-Dan.xml	27-DAN.xml
-Hos.xml	28-HOS.xml
-Joel.xml	29-JOL.xml
-Amos.xml	30-AMO.xml
-Obad.xml	31-OBA.xml
-Jonah.xml	32-JON.xml
-Mic.xml	33-MIC.xml
-Nah.xml	34-NAM.xml
-Hab.xml	35-HAB.xml
-Zeph.xml	36-ZEP.xml
-Hag.xml	37-HAG.xml
-Zech.xml	38-ZEC.xml
-Mal.xml	39-MAL.xml
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.old.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.old.pl
@ -1,63 +0,0 @@
-# Step 1 in updating OSHB to MAST_HB
-# Converts file names to WA MAST abbreviations
-# After this  need to run Convert_refs_in_MAST_HB.pl to get data in file to conform
-
-while (<DATA>) {
-		chomp;
-	if (/([^\t]*)\t([^\t]*)/) {
-		#($oldName, $newName) = ($1, $2);
-		$new_name{$1} = $2;
-	}
-}
-
-chdir = "/home/henry/Documents/WA_Repo/MAST_HB";
-
-foreach $old_name (sort keys %new_name) {
-	print "$old_name to $new_name{$old_name}\n";
-	system "mv $old_name $new_name{$old_name}"
-	#system "mv $olddir/$old_name to $newdir/$new_name{$old_name}"
-
-}
-
-print "Done.\n"
-
-__DATA__
-Gen.xml	01-GEN.xml
-Exod.xml	02-EXO.xml
-Lev.xml	03-LEV.xml
-Num.xml	04-NUM.xml
-Deut.xml	05-DEU.xml
-Josh.xml	06-JOS.xml
-Judg.xml	07-JDG.xml
-Ruth.xml	08-RUT.xml
-1Sam.xml	09-1SA.xml
-2Sam.xml	10-2SA.xml
-1Kgs.xml	11-1KI.xml
-2Kgs.xml	12-2KI.xml
-1Chr.xml	13-1CH.xml
-2Chr.xml	14-2CH.xml
-Ezra.xml	15-EZR.xml
-Neh.xml	16-NEH.xml
-Esth.xml	17-EST.xml
-Job.xml	18-JOB.xml
-Ps.xml	19-PSA.xml
-Prov.xml	20-PRO.xml
-Eccl.xml	21-ECC.xml
-Song.xml	22-SNG.xml
-Isa.xml	23-ISA.xml
-Jer.xml	24-JER.xml
-Lam.xml	25-LAM.xml
-Ezek.xml	26-EZK.xml
-Dan.xml	27-DAN.xml
-Hos.xml	28-HOS.xml
-Joel.xml	29-JOL.xml
-Amos.xml	30-AMO.xml
-Obad.xml	31-OBA.xml
-Jonah.xml	32-JON.xml
-Mic.xml	33-MIC.xml
-Nah.xml	34-NAM.xml
-Hab.xml	35-HAB.xml
-Zeph.xml	36-ZEP.xml
-Hag.xml	37-HAG.xml
-Zech.xml	38-ZEC.xml
-Mal.xml	39-MAL.xml
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Convert_OHSB_to_MAST_HB.pl
@ -1,66 +0,0 @@
-# Step 1 in updating OSHB to MAST_HB
-# Converts file names to WA MAST abbreviations
-# After this  need to run Convert_refs_in_MAST_HB.pl to get data in file to conform
-
-while (<DATA>) {
-		chomp;
-	if (/([^\t]*)\t([^\t]*)/) {
-		#($oldName, $newName) = ($1, $2);
-		$new_name{$1} = $2;
-	}
-}
-
-my $oldnames = "/Users/virginiawhitney/Documents/Henry/WA/Repos/OSHB";
-my $newnames = "/Users/virginiawhitney/Documents/Henry/WA/Repos/MAST_HB";
-chdir($oldnames);
-mkdir($newnames) if !-e $newnames;
-
-foreach $old_name (sort keys %new_name) {
-	print "$old_name to $newnames/$new_name{$old_name}\n";
-	system "mv $old_name $newnames/$new_name{$old_name}"
-	#system "mv $olddir/$old_name to $newdir/$new_name{$old_name}"
-
-}
-
-print "Done.\n"
-
-__DATA__
-Gen.xml	01-GEN.xml
-Exod.xml	02-EXO.xml
-Lev.xml	03-LEV.xml
-Num.xml	04-NUM.xml
-Deut.xml	05-DEU.xml
-Josh.xml	06-JOS.xml
-Judg.xml	07-JDG.xml
-Ruth.xml	08-RUT.xml
-1Sam.xml	09-1SA.xml
-2Sam.xml	10-2SA.xml
-1Kgs.xml	11-1KI.xml
-2Kgs.xml	12-2KI.xml
-1Chr.xml	13-1CH.xml
-2Chr.xml	14-2CH.xml
-Ezra.xml	15-EZR.xml
-Neh.xml	16-NEH.xml
-Esth.xml	17-EST.xml
-Job.xml	18-JOB.xml
-Ps.xml	19-PSA.xml
-Prov.xml	20-PRO.xml
-Eccl.xml	21-ECC.xml
-Song.xml	22-SNG.xml
-Isa.xml	23-ISA.xml
-Jer.xml	24-JER.xml
-Lam.xml	25-LAM.xml
-Ezek.xml	26-EZK.xml
-Dan.xml	27-DAN.xml
-Hos.xml	28-HOS.xml
-Joel.xml	29-JOL.xml
-Amos.xml	30-AMO.xml
-Obad.xml	31-OBA.xml
-Jonah.xml	32-JON.xml
-Mic.xml	33-MIC.xml
-Nah.xml	34-NAM.xml
-Hab.xml	35-HAB.xml
-Zeph.xml	36-ZEP.xml
-Hag.xml	37-HAG.xml
-Zech.xml	38-ZEC.xml
-Mal.xml	39-MAL.xml
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Convert_refs_in_MAST_HB.pl
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Convert_refs_in_MAST_HB.pl
@ -8,7 +8,7 @@ use utf8;
 use open IO => ":utf8";

 my ($oldName, $newName);
-my $topDir = "/Users/virginiawhitney/Documents/Henry/WA/Repos/MAST_HB";
+my $topDir = "/Users/Henry/Documents/WACS/MAST_HB";
 #my $topDir = "/Users/Henry/Documents/WACS/MAST_HB";
 my (%new_name);

@ -16,7 +16,7 @@ while (<DATA>) {
 	chomp;
 	if (/([^\t]*)\t([^\t]*)/) {
 		($oldName, $newName) = ($1, lc $2);
-		$new_name{$1} = lc $2
+		$new_name{$oldName} = $newName
 	}
 }

@ -31,12 +31,13 @@ foreach  my $file ( @filesToRun  ) {
 	foreach my $key (sort keys %new_name) {
 		$fileText =~ s/(osisID=")$key(\.\d+\.\d+")/$1$new_name{$key}$2/g;
 		$fileText =~ s/(<note>KJV:)$key(\.\d+\.\d+<\/note>)/$1$new_name{$key}$2/g;
 	}
+	$fileText =~ s/(<\/w>)\n          <seg/$1<seg/g;	# joins a <seg> on the next line to the preceding </w>; independent of $key, so it runs once per file instead of once per book key
 	open(OUT, ">:utf8", "$file") or die "$file:\n$!";
 	say OUT $fileText;
 	close OUT;
 } 
-print "Done.\n"
+print "Done.\n\nBe sure to validate XML by running\nxml val *.xml\nin both OGNT and MAST-HB directories before running\nBuild_OL_files_from_XML.pl\n"

 __DATA__
 Gen	GEN
--- a/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Exceptions/Exceptions.txt