From 5851b15017e677dee7ab7cb799d6e89fdf6525cc Mon Sep 17 00:00:00 2001
From: Henry Whitney <henry_whitney@wycliffeassociates.org>
Date: Tue, 11 Aug 2020 17:52:39 -0400
Subject: [PATCH] Tagging ULB

---
 ...s_from_unified_ULB_XML_and_tWs_and_OGNT.pl | 435 ++++++++++++++++++
 1 file changed, 435 insertions(+)
 create mode 100644 Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl
diff --git a/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl b/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl
new file mode 100644
index 00000000..d501d5d4
--- /dev/null
+++ b/Construct_auto-tagged_ULB_XML_files_from_unified_ULB_XML_and_tWs_and_OGNT.pl
@@ -0,0 +1,435 @@
+# Takes current tW entries and populates tagged OGNT XML
+# 
+# This is the current best version
+#  Requires ULB that includes USFMs.
+
+# This version maintains the Greek word order to lessen reordering.
+
+
+use 5.12.0;
+use File::Slurp;
+use File::Find ; 
+use Cwd ; 
+use utf8;
+#use open IN => ":utf8", OUT => ":utf8";
+use open IO => ":utf8";
+$" = "\n";
+
+mkdir "Logs";
+open(LOG, ">Logs/tW_pairs.txt") or die "$!";
+my $ULBfile = "/Users/Henry/Documents/WACS/en_ulb_tagged/ULB_xml/ULB.xml";
+my $topDirOGNT = "/Users/Henry/Documents/WACS/OGNT";
+#my $topDirOGNT = "/Users/Henry/Documents/WACS/en_ulb_tagged/Tag_test";
+my $topDirtW = "/Users/Henry/Documents/WACS/en_tw/bible";
+my ($outDir, $outFile) = ("/Users/Henry/Documents/WACS/en_ulb_tagged/Auto-tagged", "");
+my ($ULBText, $workText, $language);
+my ($file);
+my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);
+
+my @OGNTfilesToRun = ();
+#my $filePattern = '\.xml' ;
+my $filePattern = '41-MAT\.xml' ;
+find( sub { push @OGNTfilesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;
+
+say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
+
+my @tWfilesToRun = ();
+$filePattern = '.md' ;
+find( sub { push @tWfilesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;
+
+Read_tW_Files();
+
+close LOG;
+open(LOG, ">Logs/Log.txt") or die "$!";
+
+LongBookNames();
+Prepare_ULB_file();
+say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";
+
+ProcessXML();
+#	put unused SN at end of verse
+
+close LOG;
+
+say "\nDone.";
+# =====
+sub Read_tW_Files {
+	foreach  $file ( @tWfilesToRun  ) {
+		say LOG $file;
+		my (@sns);
+		my $entries;
+		my $fileText = read_file("$file", binmode => 'utf8');
+		if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
+			my $sns = $1;
+			#say LOG "\t$sns";
+			@sns = split /, /, $sns;
+		}
+		if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
+			$entries = $1;
+			die "$fileText" if $entries eq "";			
+			#say LOG "\t\t$entries"
+		}
+		foreach my $sn (@sns) {
+			$entriesThisSN{$sn} .= $entries . ", ";
+			#say LOG "\t\t\t$sn: $entriesThisSN{$sn}"
+		}
+	}
+	foreach my $sn (sort keys %entriesThisSN) {
+		#say LOG "$sn: $entriesThisSN{$sn}";
+		my @entries = split /, /, $entriesThisSN{$sn};
+		@entries = reverse sort {  substr($a,0,1) <=> substr($b,0,1)
+			|| length($a) <=> length($b)
+			|| $a <=> $b }
+		@entries;
+		$entriesThisSN{$sn} = "";
+		foreach my $slice (@entries) {
+			$entriesThisSN{$sn} .= "$slice, "
+		}
+		$entriesThisSN{$sn} =~ s/, $//;
+		say LOG "$sn: $entriesThisSN{$sn}";
+	}
+
+}
+sub LongBookNames {
+	while (<DATA>) {
+		chomp;
+		if (/([^\t]*)\t([^\t]*)\t([^\t]*)/) {
+			$longName{$2} = $3
+		}
+	}
+}
+sub ProcessXML {
+	foreach my $file (@OGNTfilesToRun) {
+		my $greekText;
+		my $fileGist;
+		if ($file =~ /((..)....\.xml)/) {
+			($fileGist, $language) = ($1, $2);
+			if ($language > 40) {
+				$language = "G"
+			} else {$language = "H"}
+			
+		}
+		say LOG "<0>\t$file \t$fileGist";
+		open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!";
+		my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput,
+			$linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText);
+		open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";
+		my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount);
+		while (my $thisLine = <$thisFile>) {
+			chomp $thisLine;
+			if ($thisLine =~ /<verse/) {say LOG "\n=========================="}
+			if ($thisLine =~ /<\/verse>/) {
+				say LOG "<0.1>\t$thisLine";
+				say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";
+				
+				($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
+				say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
+				my %orderedLine;
+				$matchedLines =~ s/\n{2,}/\n/gs;
+				say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
+				while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
+					$orderedLine{$2} = $1;
+					say LOG "<5>\t\$2: $2\t\$1: $1";
+				}
+				$matchedLines = "";
+				foreach my $line (sort {$a <=> $b} keys %orderedLine) {
+					say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
+					$matchedLines .= "$orderedLine{$line}\n"
+				}
+				chomp $matchedLines;
+				say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";					
+				$residueText =~ s/<usfm>.*?<\/usfm>//g;
+				$residueText =~ s/(^q | q$)//g;
+				$residueText =~ s/ {3,}/  /g;
+				$residueText =~ s/^ +//;
+				$residueText =~ s/ +$/$1/;
+				$greekText =~ s/^ +//;
+				$greekText =~ s/ +$/$1/;
+				$staticText =~ s/^ +//;
+				$staticText =~ s/ +$/$1/;
+				my $internalUSFM;
+				$internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /<usfm>.*?<\/usfm>/g);
+				$linesNotMatched =~ s/\n+$//;
+				$linesToSkip =~ s/\n+$//;
+				$matchedLines =~ s/^\n+//;
+				$internalUSFM =~ s/\n+$//;
+				$internalUSFM =~ s/\t{5,}/\t\t\t\t/g;
+				say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";					
+				say OUT "\t\t\t\t<Greek>$greekText</Greek>";
+				say OUT "\t\t\t\t<preText>$thisPreText</preText>";
+				say OUT "\t\t\t\t<ULB>$staticText</ULB>";
+				say OUT "\t\t\t\t<residue>$residueText</residue>";
+				say LOG "\$matchedLines:\n$matchedLines	\$linesNotMatched:\n$linesNotMatched	\$linesToSkip:\n$linesToSkip	";
+				my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip);
+				#say OUT "$matchedLines" unless ($matchedLines eq "");
+				#say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
+				#say OUT "$internalUSFM" unless ($internalUSFM eq "");
+				#say OUT "$linesToSkip" if ($linesToSkip);
+				$wordLines =~ s/<w/\t\t\t\t<w/gs;
+				$thisLine = "$wordLines\n$internalUSFM\n$thisLine";
+				$thisLine =~ s/\n{2,}/\n/sg;
+				say OUT "$thisLine";
+				($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount) = ();
+				($thisVerseForOutput, $flag, $workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $orderedOutputLines, $linesWithRelevantSNs) = ();
+				($linesToSkip) = ("");
+			}
+			elsif ($thisLine =~ /<w /) {
+				say LOG "<0.2>\t$thisLine";
+				$originalLinesCount ++;
+				if ($thisLine =~ />([^\n<>]*)</) {
+					$greekText .= $1 . " "
+				}
+				$thisLine =~ s/\t(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
+				if ($thisLine =~ /lemma="(\d+)"/) {
+					my $thisLemma = $language . $1;
+					if (exists $entriesThisSN{$thisLemma}) {
+						$rsnCount ++;
+						$linesWithRelevantSNs .= $thisLine . "\n";
+						say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
+					}
+					else {
+						$skipCount ++;
+						$thisLine =~ s/><\/w>/>√<\/w>/;
+						$linesToSkip .= "$thisLine\n";
+						#say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
+					}
+				}
+			}
+			elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
+				say LOG "<0.3>\t$thisLine";
+				($bk, $ch, $vs) = ($1,$2,$3);
+				($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
+				$staticText = $ULBtextThisVerse{$thisVerse};
+				$residueText = "q $staticText q";
+				$thisPreText = $ULBpreTextThisVerse{$thisVerse};
+				say OUT "\t\t\t<verse name=\"$thisVerse\">";
+				($flag) = (1);
+			}
+			else {say OUT $thisLine}
+		}
+	
+		close $thisFile;
+		close OUT;
+	}
+}
+sub RestoreGreekOrder {
+	my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_);
+	say LOG "<00>\t\$matchedLines: $matchedLines	\$linesNotMatched: $linesNotMatched	\$linesToSkip: $linesToSkip	";
+	my $wordsLine;
+	my %order;
+	foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) {
+		say LOG "\$thisOne:\n$thisOne";
+		while ($thisOne =~ /(<w OGNTsort="([^"]*)"[^>]*>[^<]*<\/w>)/g) {
+			$order{$2} = "$1\n";
+		}
+	}
+	foreach my $line (sort keys %order) {
+		$wordsLine .= $order{$line}
+	}
+	say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<";
+	return $wordsLine
+}
+sub ProcessRelevantSNs {
+	my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
+	my ($matchedLines, $thisLine);
+	my @relevantLines = split /\n/, $relevantLines;
+	foreach my $line (@relevantLines) {
+		if ($line =~ /lemma="(\d+)"/) {
+			my $thisSN = $language . $1;
+			say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";
+			($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
+			$thisLine =~ s/[ \t]+$//;
+			$matchedLines .= $thisLine . "\n";
+			$matchedLines =~ s/\n{2,}$/\n/s;
+			say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++"
+		}
+	}
+	return ($matchedLines, $residueText, $linesNotMatched);
+}
+sub MatchAndPlace {
+	my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
+	#say LOG "<8>\t\$line: $line	 \$sn: $sn	 \$workText\n$workText";
+	my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen);
+	my @entries = split /, /, $entriesThisSN{$sn};
+	foreach my $entry (@entries) {
+		my $entryType;
+		if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
+			($first, $second, $third) = ($1, $2, $3);
+			($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
+			$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third;
+			say LOG "<1a>\t\$first: $first,	 \$second: $second,	 \$third: $third,	 \$firstLen: $firstLen,	 \$secondLen,: $secondLen,	 \$thirdLen: $thirdLen	\$entry: |$entry|\t\$workEntry: |$workEntry|";
+			$entryType = 1;
+		}
+		elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
+			($first, $second) = ($1, $2);
+			($firstLen, $secondLen) = (length $first, length $second);
+			$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b";
+			say LOG "<2a>\t\$first: $first,	 \$second: $second,	 \$third: $third,	 \$firstLen: $firstLen,	 \$secondLen,: $secondLen,	 \$entry: |$entry|\t\$workEntry: |$workEntry|";
+			$entryType = 2;
+		}
+		else {$workEntry = $entry;}
+		
+		my $foundText;
+		#say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
+		if ($workText =~ /\b$workEntry\b/p) {
+			say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
+				($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}");
+				my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, "");
+				$line =~ s/></>$entry</;
+				while (length $replacementSpaces < $foundTextLength) {$replacementSpaces .= " "}
+				
+				if ($entryType) {
+
+					say LOG "<8.2>\n\$workText,: $workText,	 \$matchedLines:\n$matchedLines	";
+					
+					($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);
+					
+					say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";
+
+				}
+
+				else {$workText =~ s/ı/$replacementSpaces/;}
+				
+				$matchedLines .= "$line◊$place";
+
+				say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
+				$found = 1;
+			}
+		else {
+			#say LOG "\$workEntry $workEntry not found"
+		}
+		if ($found) {
+			last
+		}
+	}
+	unless ($found) {
+		$line =~ s/></>?</;
+		$linesNotMatched .= "$line\n"
+	}
+	return ($matchedLines, $workText, $linesNotMatched)
+}
+sub FixWorkText {
+	my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;
+	my ($firstSpace, $secondSpace, $thirdSpace);
+	while (length $firstSpace < $firstLen) {$firstSpace .= " "}
+	while (length $secondSpace < $secondLen) {$secondSpace .= " "}
+	while (length $thirdSpace < $thirdLen) {$thirdSpace .= " "}
+	
+	say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry	\$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";
+	if ($third) {
+		if ($foundText =~ /$first(.*)$second(.*)$third/) {
+			my ($firstGap, $secondGap) = ($1, $2);
+			my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
+			say LOG "<9.1> \$repText: $repText";
+			$text =~ s/ı/$repText/;
+		}
+	}
+	else {
+		if ($foundText =~ /$first(.*)$second/) {
+			my ($firstGap) = ($1);
+			say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
+			my $repText ="$firstSpace$firstGap$secondSpace";
+			say LOG "<9.3> \$repText: |$repText|";
+			$text =~ s/ı/$repText/;
+		}
+	}
+	return ($text)
+}
+sub Prepare_ULB_file {
+
+	my $thisVerse;
+	#$ULBText = read_file($ULBfile, binmode => 'utf8');
+
+	#while ($ULBText =~ /<verse name="(.*?)">\n<preText>(.*?)<\/preText>\n.*<text>(.*?)<\/text>.*<\/verse>/sg) {
+	#	($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2);
+	#}
+	#
+	
+	open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";
+	
+		while (my $thisLine = <$file>) {
+			chomp $thisLine;
+			if ($thisLine =~ /verse name="(.*?)"/) {
+				$thisVerse = $1;
+				#say LOG "$thisVerse:\n$thisLine"
+			}
+			elsif ($thisLine =~ /<preText>(.*?)<\/preText>/) {
+				$ULBpreTextThisVerse{$thisVerse} = $1;
+				#say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}"
+			}
+			elsif ($thisLine =~ /<text>(.*?)<\/text>/) {
+				$ULBtextThisVerse{$thisVerse} = $1;
+				#say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}"
+			}
+		}
+	
+	close $file;
+}
+
+__DATA__
+01	gen	Genesis
+02	exo	Exodus
+03	lev	Leviticus
+04	num	Numbers
+05	deu	Deuteronomy
+06	jos	Joshua
+07	jdg	Judges
+08	rut	Ruth
+09	1sa	1 Samuel
+10	2sa	2 Samuel
+11	1ki	1 Kings
+12	2ki	2 Kings
+13	1ch	1 Chronicles
+14	2ch	2 Chronicles
+15	ezr	Ezra
+16	neh	Nehemiah
+17	est	Esther
+18	job	Job
+19	psa	Psalms
+20	pro	Proverbs
+21	ecc	Ecclesiastes
+22	sng	Song of Solomon
+23	isa	Isaiah
+24	jer	Jeremiah
+25	lam	Lamentations
+26	ezk	Ezekiel
+27	dan	Daniel
+28	hos	Hosea
+29	jol	Joel
+30	amo	Amos
+31	oba	Obadiah
+32	jon	Jonah
+33	mic	Micah
+34	nam	Nahum
+35	hab	Habakkuk
+36	zep	Zephaniah
+37	hag	Haggai
+38	zec	Zechariah
+39	mal	Malachi
+41	mat	Matthew
+42	mrk	Mark
+43	luk	Luke
+44	jhn	John
+45	act	Acts
+46	rom	Romans
+47	1co	1 Corinthians
+48	2co	2 Corinthians
+49	gal	Galatians
+50	eph	Ephesians
+51	php	Philippians
+52	col	Colossians
+53	1th	1 Thessalonians
+54	2th	2 Thessalonians
+55	1ti	1 Timothy
+56	2ti	2 Timothy
+57	tit	Titus
+58	phm	Philemon
+59	heb	Hebrews
+60	jas	James
+61	1pe	1 Peter
+62	2pe	2 Peter
+63	1jn	1 John
+64	2jn	2 John
+65	3jn	3 John
+66	jud	Jude
+67	rev	Revelation