en_ulb_tagged/Construct_auto-tagged_ULB_X...

# Takes current tW entries and populates tagged OGNT XML
#
# This is the current best version
#  Requires ULB that includes USFMs.

# This version maintains the Greek word order to lessen reordering.


use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
$" = "\n";

mkdir "Logs";
open(LOG, ">Logs/tW_pairs.txt") or die "$!";
my $ULBfile = "/Users/dillardfam/Documents/WACS/fork/ULB_xml/ULB.xml";
my $topDirOGNT = "/Users/dillardfam/Documents/WACS/OGNT";
#my $topDirOGNT = "/Users/dillardfam/Documents/WACS/fork/Tag_test";
my $topDirtW = "/Users/dillardfam/Documents/WACS/en_tw/bible";
my ($outDir, $outFile) = ("/Users/dillardfam/Documents/WACS/fork/Auto-tagged", "");
my ($ULBText, $workText, $language);
my ($file);
my (%ULBtextThisVerse, %ULBpreTextThisVerse, %SNsThisVerse, %entriesThisSN, %longName);

my @OGNTfilesToRun = ();
#my $filePattern = '\.xml' ;
my $filePattern = '43-LUK\.xml' ;
find( sub { push @OGNTfilesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDirOGNT) ;

say LOG "\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

my @tWfilesToRun = ();
$filePattern = '.md' ;
find( sub { push @tWfilesToRun, $File::Find::name  if ( m/^(.*)$filePattern$/ ) }, $topDirtW) ;

Read_tW_Files();

close LOG;
open(LOG, ">Logs/Log.txt") or die "$!";

LongBookNames();
Prepare_ULB_file();
say LOG "Prepare_ULB_file done.\n\@OGNTfilesToRun:\n@OGNTfilesToRun\n";

ProcessXML();
#	put unused SN at end of verse

close LOG;

say "\nDone.";
# =====
sub Read_tW_Files {
	foreach  $file ( @tWfilesToRun  ) {
		say LOG $file;
		my (@sns);
		my $entries;
		my $fileText = read_file("$file", binmode => 'utf8');
		if ($fileText =~ /\* Strong's: ([^\n]*)\n/) {
			my $sns = $1;
			#say LOG "\t$sns";
			@sns = split /, /, $sns;
		}
		if ($fileText =~ /Forms Found in the English ULB:\n\n([^\n]*)\n/) {
			$entries = $1;
			die "$fileText" if $entries eq "";
			#say LOG "\t\t$entries"
		}
		foreach my $sn (@sns) {
			$entriesThisSN{$sn} .= $entries . ", ";
			#say LOG "\t\t\t$sn: $entriesThisSN{$sn}"
		}
	}
	foreach my $sn (sort keys %entriesThisSN) {
		#say LOG "$sn: $entriesThisSN{$sn}";
		my @entries = split /, /, $entriesThisSN{$sn};
		@entries = reverse sort {  substr($a,0,1) <=> substr($b,0,1)
			|| length($a) <=> length($b)
			|| $a <=> $b }
		@entries;
		$entriesThisSN{$sn} = "";
		foreach my $slice (@entries) {
			$entriesThisSN{$sn} .= "$slice, "
		}
		$entriesThisSN{$sn} =~ s/, $//;
		say LOG "$sn: $entriesThisSN{$sn}";
	}

}
sub LongBookNames {
	while (<DATA>) {
		chomp;
		if (/^([^#][^\t]*)\t([^\t]*)\t([^\t]*)/) {
			$longName{$2} = $3
		}
	}
}
sub ProcessXML {
	foreach my $file (@OGNTfilesToRun) {
		my $greekText;
		my $fileGist;
		if ($file =~ /((..)....\.xml)/) {
			($fileGist, $language) = ($1, $2);
			if ($language > 40) {
				$language = "G"
			} else {$language = "H"}

		}
		say LOG "<0>\t$file \t$fileGist";
		open(OUT, ">$outDir/$fileGist") or die "$outDir/$fileGist: $!";
		my ($pre, $gist, $post, $bk, $ch, $vs, $thisVerse, $staticText, $residueText, $matchedLines, $flag, $thisVerseForOutput,
			$linesWithRelevantSNs, $linesNotMatched, $orderedOutputLines, $linesToSkip, $thisPreText);
		open (my $thisFile, "<:utf8", "$file") or die "$file:\n$!";
		my ($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount);
		while (my $thisLine = <$thisFile>) {
			chomp $thisLine;
			if ($thisLine =~ /<verse/) {say LOG "\n=========================="}
			if ($thisLine =~ /<\/verse>/) {
				say LOG "<0.1>\t$thisLine";
				say LOG "<11>\n\$linesWithRelevantSNs\n$linesWithRelevantSNs\n\$linesToSkip\n$linesToSkip\$residueText\n$residueText";

				($matchedLines, $residueText, $linesNotMatched) = ProcessRelevantSNs($linesWithRelevantSNs, $staticText, $residueText);
				say LOG "<14>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
				my %orderedLine;
				$matchedLines =~ s/\n{2,}/\n/gs;
				say LOG "<15\tBefore sort of \$matchedLines:\n$matchedLines\n";
				while ($matchedLines =~ /([^◊]*)◊(\d*)\n/g) {
					$orderedLine{$2} = $1;
					say LOG "<5>\t\$2: $2\t\$1: $1";
				}
				$matchedLines = "";
				foreach my $line (sort {$a <=> $b} keys %orderedLine) {
					say LOG "<5.5>\t\$line: $line\t\$orderedLine{$line}: $orderedLine{$line}";
					$matchedLines .= "$orderedLine{$line}\n"
				}
				chomp $matchedLines;
				say LOG "<16>\tAfter sort of \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
				$residueText =~ s/<usfm>.*?<\/usfm>//g;
				$residueText =~ s/(^q | q$)//g;
				$residueText =~ s/ {3,}/  /g;
				$residueText =~ s/^ +//;
				$residueText =~ s/ +$/$1/;
				$greekText =~ s/^ +//;
				$greekText =~ s/ +$/$1/;
				$staticText =~ s/^ +//;
				$staticText =~ s/ +$/$1/;
				my $internalUSFM;
				$internalUSFM .= "\t\t\t\t\t$&\n" while ($staticText =~ /<usfm>.*?<\/usfm>/g);
				$linesNotMatched =~ s/\n+$//;
				$linesToSkip =~ s/\n+$//;
				$matchedLines =~ s/^\n+//;
				$internalUSFM =~ s/\n+$//;
				$internalUSFM =~ s/\t{5,}/\t\t\t\t/g;
				say LOG "<17>\tAfter pruning \$matchedLines:\n$matchedLines\n\$linesNotMatched\n$linesNotMatched";
				say OUT "\t\t\t\t<Greek>$greekText</Greek>";
				say OUT "\t\t\t\t<preText>$thisPreText</preText>";
				say OUT "\t\t\t\t<ULB>$staticText</ULB>";
				say OUT "\t\t\t\t<residue>$residueText</residue>";
				say LOG "\$matchedLines:\n$matchedLines	\$linesNotMatched:\n$linesNotMatched	\$linesToSkip:\n$linesToSkip	";
				my $wordLines = RestoreGreekOrder($matchedLines, $linesNotMatched, $linesToSkip);
				#say OUT "$matchedLines" unless ($matchedLines eq "");
				#say OUT "$linesNotMatched" unless ($linesNotMatched eq "");
				#say OUT "$internalUSFM" unless ($internalUSFM eq "");
				#say OUT "$linesToSkip" if ($linesToSkip);
				$wordLines =~ s/<w/\t\t\t\t<w/gs;
				$thisLine = "$wordLines\n$internalUSFM\n$thisLine";
				$thisLine =~ s/\n{2,}/\n/sg;
				say OUT "$thisLine";
				($originalLinesCount, $rsnCount, $skipCount, $noRSNCount, $outCount) = ();
				($thisVerseForOutput, $flag, $workText, $greekText, $linesNotMatched, $linesToSkip, $residueText, $orderedOutputLines, $linesWithRelevantSNs) = ();
				($linesToSkip) = ("");
			}
			elsif ($thisLine =~ /<w /) {
				say LOG "<0.2>\t$thisLine";
				$originalLinesCount ++;
				if ($thisLine =~ />([^\n<>]*)</) {
					$greekText .= $1 . " "
				}
				$thisLine =~ s/\t(<w .*)>([^<]*)(<\/w>)/$1 text="$2">$3/;
				if ($thisLine =~ /lemma="(\d+)"/) {
					my $thisLemma = $language . $1;
					if (exists $entriesThisSN{$thisLemma}) {
						$rsnCount ++;
						$linesWithRelevantSNs .= $thisLine . "\n";
						say LOG "<0.2.1>\t\$thisLemma: $thisLemma; line pushed to \$linesWithRelevantSNs";
					}
					else {
						$skipCount ++;
						$thisLine =~ s/><\/w>/>√<\/w>/;
						$linesToSkip .= "$thisLine\n";
						#say LOG "<0.2.2>\t\$thisLemma: $thisLemma; line pushed to \@LinesToSkip";
					}
				}
			}
			elsif ($thisLine =~ /<verse osisID="(.*?)\.(.*?)\.(.*?)">/) {
				say LOG "<0.3>\t$thisLine";
				($bk, $ch, $vs) = ($1,$2,$3);
				($thisVerse, $greekText) = ("$longName{$bk} $ch:$vs", "");
				$staticText = $ULBtextThisVerse{$thisVerse};
				$residueText = "q $staticText q";
				$thisPreText = $ULBpreTextThisVerse{$thisVerse};
				say OUT "\t\t\t<verse name=\"$thisVerse\">";
				($flag) = (1);
			}
			else {say OUT $thisLine}
		}

		close $thisFile;
		close OUT;
	}
}
sub RestoreGreekOrder {
	my ($matchedLines, $linesNotMatched, $linesToSkip) = (@_);
	say LOG "<00>\t\$matchedLines: $matchedLines	\$linesNotMatched: $linesNotMatched	\$linesToSkip: $linesToSkip	";
	my $wordsLine;
	my %order;
	foreach my $thisOne ($matchedLines, $linesNotMatched, $linesToSkip) {
		say LOG "\$thisOne:\n$thisOne";
		while ($thisOne =~ /(<w OGNTsort="([^"]*)"[^>]*>[^<]*<\/w>)/g) {
			$order{$2} = "$1\n";
		}
	}
	foreach my $line (sort keys %order) {
		$wordsLine .= $order{$line}
	}
	say LOG "<01>\t\$wordsLine:>>\n$wordsLine<<";
	return $wordsLine
}
sub ProcessRelevantSNs {
	my ($relevantLines, $staticText, $residueText, $linesNotMatched) = (@_);
	my ($matchedLines, $thisLine);
	my @relevantLines = split /\n/, $relevantLines;
	foreach my $line (@relevantLines) {
		if ($line =~ /lemma="(\d+)"/) {
			my $thisSN = $language . $1;
			say LOG "<12>\t\$line: $line, \$thisSN: $thisSN, \$entriesThisSN{$thisSN}\n$entriesThisSN{$thisSN}";
			($thisLine, $residueText, $linesNotMatched) = MatchAndPlace($line, $thisSN, $staticText, $residueText, $linesNotMatched);
			$thisLine =~ s/[ \t]+$//;
			$matchedLines .= $thisLine . "\n";
			$matchedLines =~ s/\n{2,}$/\n/s;
			say LOG "<13>\t\$matchedLines\n$matchedLines\n\$linesNotMatched\n$linesNotMatched+++"
		}
	}
	return ($matchedLines, $residueText, $linesNotMatched);
}
sub MatchAndPlace {
	my ($line, $sn, $staticText, $workText, $linesNotMatched) = @_;
	#say LOG "<8>\t\$line: $line	 \$sn: $sn	 \$workText\n$workText";
	my ($workEntry, $found, $matchedLines, $first, $second, $third, $firstLen, $secondLen, $thirdLen);
	my @entries = split /, /, $entriesThisSN{$sn};
	foreach my $entry (@entries) {
		my $entryType;
		if ($entry =~ /^(.*) \.\.\. (.*) \.\.\. (.*)$/) {
			($first, $second, $third) = ($1, $2, $3);
			($firstLen, $secondLen, $thirdLen) = (length $first, length $second, length $third);
			$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b" . ".*?" . "\\b" . $third;
			say LOG "<1a>\t\$first: $first,	 \$second: $second,	 \$third: $third,	 \$firstLen: $firstLen,	 \$secondLen,: $secondLen,	 \$thirdLen: $thirdLen	\$entry: |$entry|\t\$workEntry: |$workEntry|";
			$entryType = 1;
		}
		elsif ($entry =~ /^(.*) \.\.\. (.*)$/) {
			($first, $second) = ($1, $2);
			($firstLen, $secondLen) = (length $first, length $second);
			$workEntry = "\\b" . $first . "\\b" . ".*?" . "\\b" . $second . "\\b";
			say LOG "<2a>\t\$first: $first,	 \$second: $second,	 \$third: $third,	 \$firstLen: $firstLen,	 \$secondLen,: $secondLen,	 \$entry: |$entry|\t\$workEntry: |$workEntry|";
			$entryType = 2;
		}
		else {$workEntry = $entry;}

		my $foundText;
		#say LOG "<8.1>\t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
		if ($workText =~ /\b$workEntry\b/p) {
			say LOG "<8.1>Found: \t\$entryType: $entryType\t\$entry: $entry\t\$workEntry: $workEntry";
				($foundText, $workText) = ($&, "${^PREMATCH}ı${^POSTMATCH}");
				my ($place, $foundTextLength, $replacementSpaces) = (length ${^PREMATCH}, length $foundText, "");
				$line =~ s/></>$entry</;
				while (length $replacementSpaces < $foundTextLength) {$replacementSpaces .= " "}

				if ($entryType) {

					say LOG "<8.2>\n\$workText,: $workText,	 \$matchedLines:\n$matchedLines	";

					($workText) = FixWorkText($line, $workText, $workEntry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen);

					say LOG "<8.3>\n\$workText:\n$workText\n\$matchedLines:\n$matchedLines";

				}

				else {$workText =~ s/ı/$replacementSpaces/;}

				$matchedLines .= "$line◊$place";

				say LOG "<8.4>\tAfter found, new \$workText:\n$workText";
				$found = 1;
			}
		else {
			#say LOG "\$workEntry $workEntry not found"
		}
		if ($found) {
			last
		}
	}
	unless ($found) {
		$line =~ s/></>√</;
		$linesNotMatched .= "$line\n"
	}
	return ($matchedLines, $workText, $linesNotMatched)
}
sub FixWorkText {
	my ($thisLine, $text, $entry, $foundText, $foundTextLength, $first, $firstLen, $second, $secondLen, $third, $thirdLen) = @_;
	my ($firstSpace, $secondSpace, $thirdSpace);
	while (length $firstSpace < $firstLen) {$firstSpace .= " "}
	while (length $secondSpace < $secondLen) {$secondSpace .= " "}
	while (length $thirdSpace < $thirdLen) {$thirdSpace .= " "}

	say LOG "<9>\$text:\n$text\n\t\t\$entry: $entry	\$foundText: $foundText\t \$foundTextLength: $foundTextLength\t\$first: $first\t\$second: $second\t\$third: $third\n\$firstSpace: $firstSpace\t\$secondSpace: $secondSpace\t\$thirdSpace: $thirdSpace";
	if ($third) {
		if ($foundText =~ /$first(.*)$second(.*)$third/) {
			my ($firstGap, $secondGap) = ($1, $2);
			my $repText = "$firstSpace$firstGap$secondSpace$secondGap$thirdSpace";
			say LOG "<9.1> \$repText: $repText";
			$text =~ s/ı/$repText/;
		}
	}
	else {
		if ($foundText =~ /$first(.*)$second/) {
			my ($firstGap) = ($1);
			say LOG "<9.2>\t\$firstSpace: |$firstSpace|\t\$firstGap: |$firstGap|\t\$secondSpace: |$secondSpace|";
			my $repText ="$firstSpace$firstGap$secondSpace";
			say LOG "<9.3> \$repText: |$repText|";
			$text =~ s/ı/$repText/;
		}
	}
	return ($text)
}
sub Prepare_ULB_file {

	my $thisVerse;
	#$ULBText = read_file($ULBfile, binmode => 'utf8');

	#while ($ULBText =~ /<verse name="(.*?)">\n<preText>(.*?)<\/preText>\n.*<text>(.*?)<\/text>.*<\/verse>/sg) {
	#	($ULBtextThisVerse{$1}, $ULBpreTextThisVerse{$1}) = ($3, $2);
	#}
	#

	open (my $file, "<:utf8", "$ULBfile") or die "$ULBfile:\n$!";

		while (my $thisLine = <$file>) {
			chomp $thisLine;
			if ($thisLine =~ /verse name="(.*?)"/) {
				$thisVerse = $1;
				#say LOG "$thisVerse:\n$thisLine"
			}
			elsif ($thisLine =~ /<preText>(.*?)<\/preText>/) {
				$ULBpreTextThisVerse{$thisVerse} = $1;
				#say LOG "$thisVerse:\n$ULBpreTextThisVerse{$thisVerse}"
			}
			elsif ($thisLine =~ /<text>(.*?)<\/text>/) {
				$ULBtextThisVerse{$thisVerse} = $1;
				#say LOG "$thisVerse:\n$ULBtextThisVerse{$thisVerse}"
			}
		}

	close $file;
}

__DATA__
01	gen	Genesis
02	exo	Exodus
03	lev	Leviticus
04	num	Numbers
05	deu	Deuteronomy
06	jos	Joshua
07	jdg	Judges
08	rut	Ruth
09	1sa	1 Samuel
10	2sa	2 Samuel
11	1ki	1 Kings
12	2ki	2 Kings
13	1ch	1 Chronicles
14	2ch	2 Chronicles
15	ezr	Ezra
16	neh	Nehemiah
17	est	Esther
18	job	Job
19	psa	Psalms
20	pro	Proverbs
21	ecc	Ecclesiastes
22	sng	Song of Solomon
23	isa	Isaiah
24	jer	Jeremiah
25	lam	Lamentations
26	ezk	Ezekiel
27	dan	Daniel
28	hos	Hosea
29	jol	Joel
30	amo	Amos
31	oba	Obadiah
32	jon	Jonah
33	mic	Micah
34	nam	Nahum
35	hab	Habakkuk
36	zep	Zephaniah
37	hag	Haggai
38	zec	Zechariah
39	mal	Malachi
41	mat	Matthew
42	mrk	Mark
43	luk	Luke
44	jhn	John
45	act	Acts
46	rom	Romans
47	1co	1 Corinthians
48	2co	2 Corinthians
49	gal	Galatians
50	eph	Ephesians
51	php	Philippians
52	col	Colossians
53	1th	1 Thessalonians
54	2th	2 Thessalonians
55	1ti	1 Timothy
56	2ti	2 Timothy
57	tit	Titus
58	phm	Philemon
59	heb	Hebrews
60	jas	James
61	1pe	1 Peter
62	2pe	2 Peter
63	1jn	1 John
64	2jn	2 John
65	3jn	3 John
66	jud	Jude
67	rev	Revelation