# en_tw/ForPDF/FilesForUpdates/FindMismatchedULBSnippets.2...

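# Before running this program, run
#   find -s . -name "*.md"
# in
#   /Users/Henry/Documents/git.Door43/en_tn/
# and paste the output into dir.dir in that directory.
# NOTE(review): as far as this file shows, the script collects the .md files
# itself with File::Find and never reads dir.dir, so this step may be
# obsolete -- confirm before relying on it.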

use 5.12.0;
use File::Slurp;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
use Cwd;
use File::Find ;

# List separator used when an array is interpolated into a double-quoted
# string; the report template at the end of the script interpolates @array.
$" = "\n";

my $pwd = cwd();
my $topDir = "/Users/Henry/Documents/git.Door43/en_tn";

my (@filesToRun, @array);
# A real regex for the ".md" extension. A shell glob such as '*.md' must not be
# interpolated into a pattern: "*" and "." are regex metacharacters there, so
# names like "cmd" would be accepted too.
my $filePattern = qr/\.md\z/;
my (%abbrev, %full, %ulb);

# Collect every regular *.md file below $topDir, leaving out the LICENSE,
# README and intro files. Inside the callback $_ is the file's base name.
find(
	sub {
		return unless -f $_ && $_ =~ $filePattern;
		return if $_ =~ /(?:LICENSE|README|intro)$filePattern/;
		push @filesToRun, $File::Find::name;
	},
	$topDir
);

# Keys are regex source text, values are the replacements. %toDummy swaps
# " ... " for the wildcard ".*" and a few punctuation marks for letter codes;
# %fromDummy reverses that for display.
my %toDummy = (
	" \\.\\.\\. " => ".*",
	"\\?"         => "QM",
	"\""          => "QD",
	"\'"          => "QS",
	"\!"          => "XM",
	"\\("         => "QOXP",
	"\\)"         => "QCP",
);
my %fromDummy = (
	"\\.\\*" => " ... ",
	"QM"     => "?",
	"QD"     => "\"",
	"QS"     => "'",
	"XM"     => "!",
	"QOXP"   => "(",
	"QCP"    => ")",
);


# Open the log. The handles stay barewords because the subs below print to LOG
# and the report section prints to OUT.
my $logPath = "/Users/Henry/Google Drive/WA/Test/out/log.log";
open LOG, '>', $logPath or die "Cannot open $logPath for writing: $!";

#open output
my $outPath = "/Users/Henry/Google Drive/WA/tN instructions/mismatched_snippets.html";
open OUT, '>', $outPath or die "Cannot open $outPath for writing: $!";

# Load the book-name tables, read in the ULB, then check every note file.

ReadData();
ReadULB();
ProcessFiles();

# Fill the book-name lookup tables from the __DATA__ section.
# Every line with at least three tab-separated fields (number, abbreviation,
# full name) adds $abbrev{full name} = abbreviation and
# $full{abbreviation} = full name. Lines with fewer fields are ignored.
sub ReadData {
	while (<DATA>) {
		chomp;
		# Limit 3: anything after the second tab stays part of the full name.
		my @fields = split /\t/, $_, 3;
		next unless @fields == 3;
		my ($code, $name) = @fields[1, 2];
		$abbrev{$name} = $code;
		$full{$code}   = $name;
	}
	#foreach my $key (sort keys %abbrev) {say LOG "$key\t$abbrev{$key}"}
	#foreach my $key (sort keys %full) {say LOG "$key\t$full{$key}"}
}

# Load the ULB text into %ulb, keyed by "book/chapter/verse".
#
# Each useful input line has the form "<reference>\t<text>"; lines without a
# tab are skipped. A reference shaped like "Book C:V" becomes the key
# "<abbrev>/<chapter>/<verse>": the book name is mapped through %abbrev, and
# chapter and verse are left-padded with zeros to two digits (three digits
# when the book is "Psalms"). A line whose reference does not have that shape
# reuses the book, chapter and verse of the last line that did, so its text is
# appended to that verse.
#
# Before storing, spans running from "\f +" to "\f*" are removed from the text
# (presumably USFM footnotes -- confirm), runs of spaces are collapsed, and
# spaces next to an em dash are dropped.
#
# Dies if the ULB file cannot be opened.
sub ReadULB {
	my $ulbPath = "/Users/Henry/Google Drive/WA/Test/Unlocked Bible/ULB text.txt";
	open my $in, '<', $ulbPath or die "Cannot open $ulbPath: $!";

	# Left-pad a digit string with zeros up to $width; longer strings are
	# returned unchanged.
	my $zeroPad = sub {
		my ($digits, $width) = @_;
		my $missing = $width - length $digits;
		return $missing > 0 ? ("0" x $missing) . $digits : $digits;
	};

	# Declared outside the loop on purpose: a line without a parseable
	# reference inherits these from the previous line.
	my ($thisBook, $thisChap, $thisVerse);

	while (my $line = <$in>) {
		chomp $line;
		next unless $line =~ /^([^\t]*)\t(.*)$/;
		my ($ref, $text) = ($1, $2);

		if ($ref =~ /^([^:]*) (\d+):(\d+)/) {
			($thisBook, $thisChap, $thisVerse) = ($1, $2, $3);
			my $width = $thisBook ne "Psalms" ? 2 : 3;
			$thisChap  = $zeroPad->($thisChap,  $width);
			$thisVerse = $zeroPad->($thisVerse, $width);
			# Full name -> abbreviation; undefined when the name is not in
			# the table built by ReadData.
			$thisBook = $abbrev{$thisBook};
		}

		my $id = "$thisBook/$thisChap/$thisVerse";
		$text =~ s/\\f \+.*?\\f\*//g;
		$text =~ s/ {2,}/ /g;
		$ulb{$id} .= "$text ";
		$ulb{$id} =~ s/— /—/g;
		$ulb{$id} =~ s/ —/—/g;
	}
	close $in;
	#foreach my $key (sort keys %ulb) {say LOG "|$key|\t<$ulb{$key}>"}
}

#	assign passages as values to chunk keys

#Read in each file
# Check every note file in @filesToRun against the ULB text of its verse.
#
# A file path must end in "<book>/<chapter>/<verse>.md"; those three
# components form the %ulb key, and paths with any other layout are skipped.
# The file content and the verse text are both run through %toDummy, some
# header lines and link lines are stripped from the notes, and every remaining
# "# ..." heading is treated as a snippet that must occur in the verse text.
# A ".*" in the snippet (the %toDummy form of " ... ") matches any run of
# characters; everything else is matched literally.
#
# Each snippet that is not found is written to LOG and appended, as an HTML
# fragment, to @array.
sub ProcessFiles {
	FILE: foreach my $slice (@filesToRun) {
		next FILE unless $slice =~ m{/(([^./]+)/([^./]+)/([^./]+))\.md$};
		my ($anchor, $tb, $ct, $vt) = ($1, $2, $3, $4);
		$tb = $full{$tb};
		$ct =~ s/^0+//;
		$vt =~ s/^0+//;
		# No ULB entry: compare against an empty text so every snippet of the
		# file is reported.
		my $thisText = $ulb{$anchor} // '';

		my $tN = read_file("$slice", binmode => 'utf8');
		foreach my $key (sort keys %toDummy) {
			$tN =~ s/$key/$toDummy{$key}/g;
			$thisText =~ s/$key/$toDummy{$key}/g;
		}
		$tN =~ s/# ((General Information|Connecting Statement|translationWords):?)[^\r\n]*\r?\n//g;
		$tN =~ s/\* \[\[[^\r\n]*\r?\n//g;
		# Keep each heading line and drop the two lines that follow it.
		$tN =~ s/(#[^\r\n]*\r?\n)[^\r\n]*\r?\n[^\r\n]*\r?\n/$1/g;
		# Loop-invariant, so done once per file rather than once per note.
		$thisText =~ s/ {2,}/ /g;

		while ($tN =~ /# ([^\r\n]*)\r?\n/g) {
			my $thisNote = $1;

			# Only ".*" acts as a wildcard; the rest of the note is quoted so
			# characters such as "[", "*" or "|" cannot break or alter the
			# pattern. The (?:...) wrapper keeps the pattern from ever being
			# empty, which would make Perl reuse the last successful pattern.
			my $notePattern = join '.*',
				map { quotemeta($_) } split /\.\*/, $thisNote, -1;
			next if $thisText =~ /(?:$notePattern)/;

			say LOG ">A>\n$tb $ct:$vt\n$slice\n$thisNote\n$thisText\n<A<";

			# Restore the punctuation on copies only: $thisText must stay in
			# its dummy form for the remaining notes of this file.
			my ($shownNote, $shownText) = ($thisNote, $thisText);
			foreach my $key (sort keys %fromDummy) {
				$shownNote =~ s/$key/$fromDummy{$key}/g;
				$shownText =~ s/$key/$fromDummy{$key}/g;
			}
			push @array, "\n<p>$tb $ct:$vt</p>\n<p><b>$slice</b></p>\n<p><i>$shownNote</i></p>\n<p>$shownText</p>";
		}
	}
}

# Write the HTML report: one entry per mismatch collected in @array, joined
# with newlines. Failures while writing or closing the report are fatal,
# because buffered write errors only surface at close and would otherwise
# leave a truncated report behind a cheerful "Done.".
my $reportEntries = join "\n", @array;
my $reportHtml = qq{<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
        "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
	<meta http-equiv="content-type" content="text/html; charset=utf-8" />
	<title>Mismatched Snippets</title>
	<meta name="generator" content="BBEdit 8.5" />
</head>
<body>
$reportEntries
</body>
</html>};
print OUT $reportHtml or die "Cannot write the mismatch report: $!";
close OUT or die "Cannot close the mismatch report: $!";

# The log is only a debugging aid, so a failure here is reported but not fatal.
close LOG or warn "Cannot close the log file: $!";

say "Done.";

__DATA__
01	gen	Genesis
02	exo	Exodus
03	lev	Leviticus
04	num	Numbers
05	deu	Deuteronomy
06	jos	Joshua
07	jdg	Judges
08	rut	Ruth
09	1sa	1 Samuel
10	2sa	2 Samuel
11	1ki	1 Kings
12	2ki	2 Kings
13	1ch	1 Chronicles
14	2ch	2 Chronicles
15	ezr	Ezra
16	neh	Nehemiah
17	est	Esther
18	job	Job
19	psa	Psalms
20	pro	Proverbs
21	ecc	Ecclesiastes
22	sng	Song of Songs
23	isa	Isaiah
24	jer	Jeremiah
25	lam	Lamentations
26	ezk	Ezekiel
27	dan	Daniel
28	hos	Hosea
29	jol	Joel
30	amo	Amos
31	oba	Obadiah
32	jon	Jonah
33	mic	Micah
34	nam	Nahum
35	hab	Habakkuk
36	zep	Zephaniah
37	hag	Haggai
38	zec	Zechariah
39	mal	Malachi
41	mat	Matthew
42	mrk	Mark
43	luk	Luke
44	jhn	John
45	act	Acts
46	rom	Romans
47	1co	1 Corinthians
48	2co	2 Corinthians
49	gal	Galatians
50	eph	Ephesians
51	php	Philippians
52	col	Colossians
53	1th	1 Thessalonians
54	2th	2 Thessalonians
55	1ti	1 Timothy
56	2ti	2 Timothy
57	tit	Titus
58	phm	Philemon
59	heb	Hebrews
60	jas	James
61	1pe	1 Peter
62	2pe	2 Peter
63	1jn	1 John
64	2jn	2 John
65	3jn	3 John
66	jud	Jude
67	rev	Revelation