PDF OT cleanup

This commit is contained in:
Henry Whitney 2020-04-08 18:15:52 -04:00
parent bdf98fc499
commit f84af21350
2 changed files with 99 additions and 0 deletions

View File

@ -0,0 +1,14 @@
use 5.12.0;
use File::Find ;
use Cwd ;
# Lists every Markdown (*.md) file found beneath $topDir, one path per line.

# Root of the directory tree to scan.
my $topDir = "/Users/Henry/";
my @filesToRun = ();

# Match a literal ".md" suffix. The previous glob-style string '*.md' was
# interpolated into a regex, where "." matched any character and "(.*)*"
# was a nested quantifier, so names such as "cmd" or "foo_md" matched too.
my $filePattern = qr/\.md\z/;

# File::Find sets $_ to the bare file name (it chdir()s into each directory)
# and $File::Find::name to the full path. The -f test keeps directories that
# happen to end in ".md" out of the list.
find(
    sub {
        push @filesToRun, $File::Find::name
            if $_ =~ $filePattern && -f $_;
    },
    $topDir
);

foreach my $file ( @filesToRun )
{
    print "$file\n" ;
}

View File

@ -0,0 +1,85 @@
use 5.12.0;
use File::Slurp;
use File::Find;
# Path separator used when manipulating file names.
my $d = "/";

# File-scoped state shared by the subs below.
# %uniqueEntries is consulted by GetUniqueEntries when looking for duplicates;
# the other hashes are not referenced by the code visible here.
my (%entriesThisPage, %uniqueEntries, %sourcePage, %entriesThisSN);

# @rawArray    - every entry collected by GetUniqueEntries
# @searchArray - the entries SearchFile looks for
my (@rawArray, @searchArray);

# Report text built up by the subs and written by Output. Initialised to
# empty strings so an empty result prints as an empty section, not undef.
my ($dupList, $rawList, $irrelevantEntries) = ("", "", "");

# Both handles are package (bareword) handles because the subs refer to them
# by name. LOG appears to be used only by commented-out debug statements.
open LOG, ">:utf8", "logs/log.log"
    or die "Cannot open logs/log.log for writing: $!";
open OUT, ">:utf8", "output/output.txt"
    or die "Cannot open output/output.txt for writing: $!";

say "Starting ...";
GetUniqueEntries();
SearchFile();
Output();

# Buffered write errors (e.g. disk full) only surface at close time.
close OUT or die "Cannot close output/output.txt: $!";
close LOG or die "Cannot close logs/log.log: $!";
print "\n\tDone.";
# GetUniqueEntries()
#
# Walks the tW directory tree, takes the first top-level Markdown heading
# ("# term, term, ...") from every *.md file, and splits it into entries.
#
# Takes no arguments and returns nothing useful; it fills these file-scoped
# variables:
#   $rawList       - all heading texts joined with ", "
#   @rawArray      - the individual non-empty entries
#   %uniqueEntries - entries seen so far (used to detect duplicates)
#   $dupList       - newline-terminated entries seen more than once
#   @searchArray   - each distinct entry once, longest first
#
# Dies if a file found by the directory walk cannot be opened.
sub GetUniqueEntries {
    say "Pairing tW entries with tW pages and unique Strong's numbers";
    my $topTwDir = "/Users/Henry/Documents/WACS/en_tw/bible";

    # Literal ".md" suffix. The earlier glob-style '*.md' was interpolated
    # into a regex, so "." matched any character and unrelated names matched.
    my $filePattern = qr/\.md\z/;

    my @filesToRun;
    find(
        sub {
            # $_ is the bare file name; -f skips directories ending in ".md".
            push @filesToRun, $File::Find::name
                if $_ =~ $filePattern && -f $_;
        },
        $topTwDir
    );
    @filesToRun = sort @filesToRun;
    #say LOG "\@filesToRun: @filesToRun";

    foreach my $file (@filesToRun) {
        print ".";    # progress indicator, one dot per file

        # Decode as UTF-8 so entries compare correctly with the UTF-8 text
        # read in SearchFile and are not double-encoded on the ":utf8" output.
        open my $in, '<:encoding(UTF-8)', $file
            or die "Cannot open $file: $!";

        # Only the first "# ..." heading is taken. The previous code closed
        # the handle inside the loop, so it only ever looked at line one and
        # then ended by reading a closed handle.
        while (my $line = <$in>) {
            next unless $line =~ /^# ([^\n]*)$/;
            my $thisList = $1;
            $thisList =~ s/[\r\n ]+$//;          # trailing CR / spaces
            #say LOG "\$thisList = |$thisList|";
            $thisList =~ s/ \([^\)]*\)//g;       # drop parenthesised notes
            $thisList =~ s/ \.\.\. /.*?/g;       # " ... " becomes a wildcard
            $rawList .= $thisList . ", ";
            last;
        }
        close $in;
    }

    # split discards the empty field after the final ", "; grep drops any
    # other empty entries, which would otherwise act as an empty pattern.
    @rawArray = grep { length } split /, /, ($rawList // "");

    # Longest entries first, so a short entry cannot consume part of a longer
    # one when the text is searched. The old comparator used "<=>" on
    # non-numeric strings (always equal), so length was the only working key;
    # ties are now broken with a string comparison.
    my @sortedArray = sort { length($b) <=> length($a) || $a cmp $b } @rawArray;

    foreach my $slice (@sortedArray) {
        # Post-increment records the entry: the first sighting is searched,
        # later sightings are reported as duplicates. Previously the hash was
        # never filled, so duplicates went undetected and were searched twice.
        if ($uniqueEntries{$slice}++) {
            $dupList .= "$slice\n";
        } else {
            push @searchArray, $slice;
        }
    }
}
# SearchFile()
#
# Reads the ULB text file and, for every entry in @searchArray, deletes all
# occurrences of that entry from the in-memory text. Entries that never
# match are appended, one per line, to $irrelevantEntries.
#
# Matches are deleted as they are found, so the order of @searchArray
# matters: an entry cannot match text that an earlier entry already consumed.
#
# Takes no arguments; the result is left in the file-scoped
# $irrelevantEntries.
sub SearchFile {
    say "Searching the file";
    my $fileText = read_file("/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt", binmode => 'utf8');

    foreach my $thisEntry (@searchArray) {
        # An empty pattern would make s///g reuse the last successful
        # pattern, so skip empty entries.
        next unless defined $thisEntry && length $thisEntry;

        # Entries are literal text, except for the ".*?" wildcard that stands
        # in for " ... ". Escape every literal part so characters such as
        # "." "?" or "[" in a heading cannot alter or break the regex. The -1
        # limit keeps a leading or trailing wildcard.
        my $pattern = join '.*?',
                      map { quotemeta($_) }
                      split /\Q.*?\E/, $thisEntry, -1;

        # s///g returns the number of substitutions, false when none.
        my $removed = ($fileText =~ s/$pattern//g);
        $irrelevantEntries .= $thisEntry . "\n" unless $removed;
    }
}
# Output()
#
# Announces the step on STDOUT, then writes the two report sections to the
# OUT filehandle: the entries collected in $irrelevantEntries followed by
# the entries collected in $dupList, each under its own heading.
sub Output {
    say "Outputting";

    # Assemble each section (heading + body) before writing it.
    my $deletionSection  = join '', "Entries to be deleted:\n", $irrelevantEntries;
    my $duplicateSection = join '', "\nDuplicate entries\n", $dupList;

    say OUT $deletionSection;
    say OUT $duplicateSection;
}