From 92c7b7ca6e31d31a54a9d1884b87e6c04728ae49 Mon Sep 17 00:00:00 2001
From: Henry Whitney
Date: Mon, 27 Apr 2020 13:35:01 -0400
Subject: [PATCH] Trying to make more useful

---
Note: the two scripts below differ from the committed revision. Their input
loops read from the DATA section and the CSV filehandle, the delimiter inside
the bracketed groups is matched as a literal character, opens use lexical
handles and are checked, the per-file entry list is cleared with an empty
list, and the input path / top directory may be given as the first
command-line argument. The index hashes are those of the committed revision.

 .../Build_MAST_NT_from_csv.generic.pl         | 149 ++++++++++++++++++
 .../Find_duplicate_tW_entries.pl              |  64 ++++++++
 2 files changed, 213 insertions(+)
 create mode 100755 MAST_tW_PDF_Updater/FilesForUpdates/Build_MAST_NT_from_csv.generic.pl
 create mode 100644 MAST_tW_PDF_Updater/FilesForUpdates/Find_duplicate_tW_entries.pl

diff --git a/MAST_tW_PDF_Updater/FilesForUpdates/Build_MAST_NT_from_csv.generic.pl b/MAST_tW_PDF_Updater/FilesForUpdates/Build_MAST_NT_from_csv.generic.pl
new file mode 100755
index 0000000..4a5439f
--- /dev/null
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Build_MAST_NT_from_csv.generic.pl
@@ -0,0 +1,149 @@
+use 5.12.0;
+use warnings;
+use File::Slurp;
+use File::Find ;
+use Cwd ;
+use utf8;
+#use open IN => ":utf8", OUT => ":utf8";
+use open IO => ":utf8";
+binmode STDOUT, ":encoding(UTF-8)";
+
+# Reads a tab-separated CSV and writes one file per book into OGNT/, named
+# "<book number>-<book code>.xml" after the table that follows __DATA__.
+# Usage: perl Build_MAST_NT_from_csv.generic.pl [input.csv]
+# The input defaults to OpenGNT_version3_3.csv in the current directory.
+
+mkdir "OGNT";
+mkdir "Logs";
+
+open my $log_fh, '>', "Logs/log.log" or die "Logs/log.log: $!";
+
+my (%bk);    # book number => three-letter book code, filled from __DATA__
+my ($last_bn, $last_ch, $last_vs, $bklc) = ("00", "00", "00");
+my $out_fh;  # book file being written; stays undef until the first book
+
+# Delimiter between the parts of a bracketed group.  The ASCII bar and the
+# fullwidth bar (U+FF5C) are both accepted.
+# NOTE(review): confirm which of the two the CSV really uses.
+my $BAR   = qr/[|\x{FF5C}]/;
+my $FIELD = qr/[^|\x{FF5C}]*/;
+
+# One data row.  The seven captures are the values Separate() names bn, ch,
+# vs, word, lexeme, gram and sn.  The bar is matched as a literal character:
+# written bare it is regex alternation, and then nearly every line matches.
+my $ROW_RE = qr{
+    (?: [^\t]* \t ){4}                  # four leading columns, ignored
+    . \t                                # a single-character column
+    [^\t]* \t                           # one more ignored column
+    〔 (\d+) $BAR (\d+) $BAR (\d+) 〕 \t  # bn, ch, vs
+    〔 $FIELD $BAR $FIELD $BAR           # two leading parts, ignored
+       ($FIELD) $BAR ($FIELD) $BAR ($FIELD) $BAR ($FIELD) 〕
+}x;
+
+say "Reading data ...";
+while (my $line = <DATA>) {
+    chomp $line;
+    if ($line =~ /^(\d\d)-(...)/) {
+        $bk{$1} = $2;
+    }
+}
+
+#open IN, "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/OGNT/OpenGNT_version3_3.csv" or die "$!";
+my $csv_path = shift(@ARGV) // "OpenGNT_version3_3.csv";
+open my $in_fh, '<', $csv_path or die "$csv_path: $!";
+
+say "Reading input ...";
+
+# Separate() works on $_, so each row is deliberately read into $_.
+while (<$in_fh>) {
+    chomp;
+    Separate();
+}
+
+# Trailer for the last book; skipped when no row matched and no file was opened.
+say {$out_fh} " \n \n \n" if defined $out_fh;
+
+say "Closing input and output files ...";
+
+if (defined $out_fh) {
+    close $out_fh or die "close output: $!";
+}
+close $in_fh;
+close $log_fh or die "Logs/log.log: $!";
+
+say "Done.";
+
+# Handles the CSV row held in $_: opens a new book file when the book number
+# changes, writes a separator when the chapter or verse changes, then writes
+# the word on its own tab-indented line.  Rows that do not match are ignored.
+#
+# NOTE(review): every literal written to the book file holds only whitespace
+# and \n escapes, and $bklc, $lexeme, $gram and $sn are set but never written.
+# That looks like markup lost from the literals; check the repository copy.
+sub Separate {
+    return unless /$ROW_RE/;
+
+    my ($bn, $ch, $vs, $word, $lexeme, $gram, $sn) = ($1, $2, $3, $4, $5, $6, $7);
+    say {$log_fh} "$bn, $ch, $vs, $word, $lexeme, $gram, $sn";
+    $sn =~ s/[GH]//;
+    $bn = $bn + 1;    # %bk is keyed by the CSV book number plus one
+
+    if ($bn ne $last_bn) {
+        my $this_bk = $bk{$bn};
+        if (!defined $this_bk) {
+            say {$log_fh} "No book code for book number $bn; row skipped";
+            return;
+        }
+        $bklc = lc $this_bk;
+
+        # Finish the previous book before starting the next one.
+        if (defined $out_fh) {
+            say {$out_fh} " \n \n \n";
+            close $out_fh or die "close output: $!";
+        }
+        open my $fh, '>:utf8', "OGNT/$bn-$this_bk.xml" or die "$! $bn-$this_bk.xml";
+        $out_fh = $fh;
+        say {$out_fh} "\n\n
+\n \n ";
+        ($last_bn, $last_ch, $last_vs) = ($bn, $ch, $vs);
+    }
+    elsif ($ch ne $last_ch) {
+        say {$out_fh} " \n \n \n ";
+        ($last_ch, $last_vs) = ($ch, $vs);
+    }
+    elsif ($vs ne $last_vs) {
+        say {$out_fh} " \n ";
+        $last_vs = $vs;
+    }
+    say {$out_fh} "\t\t\t\t$word";
+    return;
+}
+
+__DATA__
+41-MAT.xml
+42-MRK.xml
+43-LUK.xml
+44-JHN.xml
+45-ACT.xml
+46-ROM.xml
+47-1CO.xml
+48-2CO.xml
+49-GAL.xml
+50-EPH.xml
+51-PHP.xml
+52-COL.xml
+53-1TH.xml
+54-2TH.xml
+55-1TI.xml
+56-2TI.xml
+57-TIT.xml
+58-PHM.xml
+59-HEB.xml
+60-JAS.xml
+61-1PE.xml
+62-2PE.xml
+63-1JN.xml
+64-2JN.xml
+65-3JN.xml
+66-JUD.xml
+67-REV.xml
diff --git a/MAST_tW_PDF_Updater/FilesForUpdates/Find_duplicate_tW_entries.pl b/MAST_tW_PDF_Updater/FilesForUpdates/Find_duplicate_tW_entries.pl
new file mode 100644
index 0000000..bd8bf39
--- /dev/null
+++ b/MAST_tW_PDF_Updater/FilesForUpdates/Find_duplicate_tW_entries.pl
@@ -0,0 +1,64 @@
+use 5.12.0;
+use warnings;
+use File::Slurp;
+use File::Find ;
+use Cwd ;
+use utf8;
+#use open IN => ":utf8", OUT => ":utf8";
+use open IO => ":utf8";
+
+# Scans every .md file under a top directory, reads the comma-separated names
+# on the "# " heading that opens each file, and writes to Output/output.txt
+# every name that has already been seen.
+# Usage: perl Find_duplicate_tW_entries.pl [top_directory]
+
+mkdir "Logs";
+mkdir "Output";
+
+open my $log_fh, '>:utf8', "Logs/log.log" or die "Logs/log.log: $!";
+open my $out_fh, '>:utf8', "Output/output.txt" or die "Output/output.txt: $!";
+
+my $topDir = shift(@ARGV) // "/Users/Henry/Documents/WACS/en_tw/bible";
+my $filePattern = '\.md' ;
+my %foundEntries;    # every entry name seen so far
+my @theseEntries;    # names from the file in hand; ParseEntries() reads and clears it
+
+my @filesToRun = ();
+find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
+
+foreach my $file ( @filesToRun ) {
+    say {$log_fh} $file;
+    my $fileText = read_file($file, binmode => 'utf8');
+    # Without /m the ^ anchors at the start of the text, so only a heading on
+    # the very first line is used.
+    if ($fileText =~ /^# ([^\n]*?)\n/) {
+        my $heading = $1;
+        @theseEntries = split /, /, $heading;
+        ParseEntries();
+    }
+}
+
+close $out_fh or die "Output/output.txt: $!";
+close $log_fh or die "Logs/log.log: $!";
+
+print "\n\tDone.";
+
+# ============
+
+# Strips " (...)" qualifiers from each name in @theseEntries, writes names
+# already present in %foundEntries to the output file, records the rest, and
+# finally empties @theseEntries.
+sub ParseEntries {
+    foreach my $entry (@theseEntries) {
+        $entry =~ s/ \([^\)]*\)//g;
+        if (exists $foundEntries{$entry}) {
+            say {$out_fh} $entry;
+        }
+        else {
+            $foundEntries{$entry} = $entry;
+        }
+    }
+    # An empty list, not "": assigning "" left one empty-string element behind.
+    @theseEntries = ();
+    return;
+}