en_ulb_tagged/Find_first_occurrence_of_pr...

60 lines
1.1 KiB
Perl

use 5.12.0;
use File::Slurp;
use File::Find ;
use Cwd ;
use warnings;

# Lists the capitalised words of the ULB text that never appear in lowercase
# anywhere in that text, each with the location of its first occurrence.
#
# Input:   $ulb_path, read line by line. Each line is expected to look like
#          "<location>\t<text>"; lines without a tab are ignored.
# Output:  $results_path, one "<word>, <location>" line per word, sorted by
#          word. The unfiltered list is written first and is then replaced,
#          in the same file, by the filtered list.
# Effects: $log_path is created/truncated (nothing is written to it at the
#          moment); progress messages are printed with say.
# Dies if any of the files cannot be opened, or if a written file cannot be
# closed cleanly.

my $ulb_path     = "/Users/Henry/Documents/WACS/Tips_and_Hacks/MAST_tW_PDF_Updater/FilesForUpdates/Temp/ULB_text.txt";
my $results_path = 'out/results.txt';
my $log_path     = 'log/log.log';

# A capitalised word, optionally followed by a hyphen and a second
# capitalised word (e.g. "Abc" or "Abc-Def").
my $capitalised_word = qr/([A-Z][a-z]+(?:-[A-Z][a-z]+)?)/;

# word => location (text before the first tab) of the line it first appears on
my %location;

# Kept open for the whole run so that debugging output can be sent to it.
open my $log, '>', $log_path or die "Cannot write $log_path: $!";

open my $in, '<', $ulb_path or die "Cannot read $ulb_path: $!";
say "Reading ULB";
while (my $line = <$in>) {
    chomp $line;

    # Split at the first tab only; the text itself may contain further tabs.
    my ($where, $text) = $line =~ /\A([^\t]*)\t(.*)\z/s
        or next;

    # One left-to-right scan finds every capitalised word on the line.
    while ($text =~ /$capitalised_word/g) {
        my $word = $1;
        $location{$word} = $where unless exists $location{$word};
    }
}
close $in;

say "Outputting hash";
_write_results($results_path, \%location);

say "Deleting common words";
# NOTE: the whole file is searched, location column included, so a lowercase
# hit anywhere in the file removes the word.
my $fileText = read_file($ulb_path, binmode => 'utf8');
for my $word (keys %location) {
    my $lowercase = lc $word;

    # \Q...\E keeps the word literal inside the pattern.
    delete $location{$word} if $fileText =~ /\b\Q$lowercase\E\b/;
}

say "Outputting final product";
_write_results($results_path, \%location);

close $log or die "Cannot close $log_path: $!";

# Writes "<word>, <location>" lines, sorted by word, to $path, replacing any
# previous content of that file.
#   $path        - file to (over)write
#   $location_of - hash reference mapping word => location
sub _write_results {
    my ($path, $location_of) = @_;

    open my $out, '>', $path or die "Cannot write $path: $!";
    for my $word (sort keys %{$location_of}) {
        say {$out} "$word, $location_of->{$word}";
    }
    # Buffered write errors only show up when the handle is closed.
    close $out or die "Cannot close $path: $!";
    return;
}