PDF OT cleanup
This commit is contained in:
parent
bdf98fc499
commit
f84af21350
|
@ -0,0 +1,14 @@
|
|||
use 5.12.0;
use warnings;
use File::Find ;
use Cwd ;

# Lists every Markdown file (name ending in ".md") found below a directory,
# printing one full path per line in the order File::Find visits them.
#
# Usage: the directory to scan may be given as the first command-line
# argument; when it is omitted, the original default directory is used.
my $topDir = shift(@ARGV) // "/Users/Henry/";

my @filesToRun;

# Test for a literal ".md" suffix on plain files. A shell glob such as '*.md'
# must not be interpolated into a regex: there "." matches any character, so
# names like "readmd" would be accepted as well.
find(
    sub { push @filesToRun, $File::Find::name if -f $_ && /\.md\z/ },
    $topDir
);

say for @filesToRun;
|
|
@ -0,0 +1,85 @@
|
|||
use 5.12.0;
use File::Slurp;
use File::Find;

# Path separator.
my $d = "/";

# State shared by the subs defined further down in this file.
my (%entriesThisPage, %uniqueEntries, %sourcePage, %entriesThisSN);
my (@rawArray, @searchArray);

# The string accumulators start empty so they can be appended to and printed
# even when nothing is ever added to them.
my ($dupList, $rawList, $irrelevantEntries) = ('', '', '');

# LOG and OUT stay bareword handles because the subs below refer to them by
# name. Both paths are relative to the current working directory.
my $logPath = "logs/log.log";
my $outPath = "output/output.txt";
open LOG, ">:utf8", $logPath or die "Cannot open $logPath for writing: $!";
open OUT, ">:utf8", $outPath or die "Cannot open $outPath for writing: $!";

say "Starting ...";
GetUniqueEntries();
SearchFile();
Output();

# Buffered write errors only surface when the handle is closed, so check it.
close OUT or die "Cannot close $outPath: $!";
close LOG or die "Cannot close $logPath: $!";

print "\n\tDone.";
|
||||
|
||||
# GetUniqueEntries
#
# Collects entry names from the level-1 Markdown heading ("# ...") on the
# first line of every ".md" file below the tW directory.
#
# Takes no arguments and is called for its side effects on these file-level
# variables:
#   $rawList       - every heading found, each followed by ", "
#   @rawArray      - $rawList split on ", " into individual entries
#   @searchArray   - each distinct entry once, in sorted order
#   %uniqueEntries - the entries already placed in @searchArray
#   $dupList       - entries met again after their first occurrence, one per line
#
# Dies if one of the files cannot be opened.
sub GetUniqueEntries {
    say "Pairing tW entries with tW pages and unique Strong's numbers";

    my $topTwDir = "/Users/Henry/Documents/WACS/en_tw/bible";

    # Test for a literal ".md" suffix on plain files; a shell glob like '*.md'
    # interpolated into a regex lets "." match any character.
    my @filesToRun;
    find(
        sub { push @filesToRun, $File::Find::name if -f $_ && /\.md\z/ },
        $topTwDir
    );
    @filesToRun = sort @filesToRun;

    foreach my $file (@filesToRun) {
        print ".";
        $file =~ s/\//\\/g unless $^O eq "linux" || $^O eq "darwin";

        # Decode as UTF-8 so the entries are character strings, like the
        # UTF-8-decoded text they are matched against in SearchFile.
        open my $in, '<:encoding(UTF-8)', $file
            or die "Cannot open $file: $!";

        # Only the first line of each file is examined.
        # NOTE(review): confirm no file carries its "# " heading further down.
        my $heading = <$in>;
        close $in;

        next unless defined $heading && $heading =~ /^# ([^\n]*)$/;

        my $thisList = $1;
        $thisList =~ s/[\r\n ]+$//;        # trailing CR / LF / spaces
        $thisList =~ s/ \([^\)]*\)//g;     # drop " (parenthesised)" remarks
        $thisList =~ s/ \.\.\. /.*?/g;     # " ... " becomes a regex wildcard
        $rawList .= $thisList . ", ";
    }

    # A heading may itself list several forms separated by ", "; splitting the
    # joined text yields one element per form.
    @rawArray = split /, /, $rawList // '';

    # Strings that do not look like numbers compare as 0 under <=>, so for
    # ordinary word entries only the length comparison decides the order and,
    # after the reverse, the longest entries come first.
    # NOTE(review): the first-character and whole-string numeric comparisons
    # only matter for entries that start with digits - confirm they are wanted.
    my @sortedArray = reverse sort {
               substr($a, 0, 1) <=> substr($b, 0, 1)
            || length($a) <=> length($b)
            || $a <=> $b
    } @rawArray;

    foreach my $slice (@sortedArray) {
        if (exists $uniqueEntries{$slice}) {
            $dupList .= "$slice\n";
        }
        else {
            # Remember the entry; without this the duplicate test above could
            # never succeed and $dupList would always stay empty.
            $uniqueEntries{$slice} = 1;
            push @searchArray, $slice;
        }
    }
}
|
||||
|
||||
# SearchFile
#
# Finds the entries of @searchArray that never occur in a text file.
#
# Arguments:
#   $textPath - optional path of the UTF-8 text to search; defaults to the
#               ULB text file this script has always used.
#
# Each entry is deleted from an in-memory copy of the text, in @searchArray
# order, so a later entry can only match what earlier entries left behind.
# An entry whose deletion does not shorten the text is appended, followed by
# a newline, to the file-level $irrelevantEntries.
sub SearchFile {
    my ($textPath) = @_;
    $textPath //= "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt";

    say "Searching the file";
    my $fileText = read_file($textPath, binmode => 'utf8');

    foreach my $thisEntry (@searchArray) {
        # An empty pattern in s/// silently reuses the last successful
        # pattern, so an empty entry must never reach the substitution.
        next unless length $thisEntry;

        # ".*?" is the one deliberate wildcard in an entry (it stands for
        # " ... " in a heading). Everything around it is literal text and is
        # escaped so characters such as "(", "[", "+" or "." cannot break or
        # widen the match.
        my $pattern = join '.*?',
            map { quotemeta } split /\Q.*?\E/, $thisEntry, -1;

        my $lengthBefore = length $fileText;
        $fileText =~ s/$pattern//g;

        if (length($fileText) == $lengthBefore) {
            $irrelevantEntries .= $thisEntry . "\n";
        }
    }
}
|
||||
# Output
#
# Writes the two result sections to the OUT handle: first the entries that
# were never found in the searched text, then the duplicate entries.
# Takes no arguments; reads the file-level $irrelevantEntries and $dupList.
sub Output {
    say "Outputting";

    my $deletionReport  = "Entries to be deleted:\n" . $irrelevantEntries;
    my $duplicateReport = "\nDuplicate entries\n" . $dupList;

    say OUT $deletionReport;
    return say OUT $duplicateReport;
}
|
Loading…
Reference in New Issue