forked from WycliffeAssociates/en_ulb_tagged
58 lines
1.2 KiB
Perl
58 lines
1.2 KiB
Perl
# Extracts all mappings from Greek text of manually tagged ULB
|
|
|
|
use 5.18.0;
|
|
use File::Slurp;
|
|
use File::Find ;
|
|
use List::MoreUtils qw(uniq);;
|
|
use Cwd ;
|
|
use utf8;
|
|
#use open IN => ":utf8", OUT => ":utf8";
|
|
use open IO => ":utf8";
|
|
|
|
open(LOG, ">Logs/Log.txt") or die "$!";
|
|
|
|
my $topDir = "/Users/Henry/Documents/WACS/en_ulb_tagged/Manual_Tagging";
|
|
|
|
my %mapTo;
|
|
|
|
my @filesToRun = ();
|
|
my $filePattern = '\.xml' ;
|
|
my $file;
|
|
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
|
|
|
|
ReadFiles();
|
|
Output();
|
|
|
|
close LOG;
|
|
|
|
say "\nDone.";
|
|
# =====
|
|
sub ReadFiles {
|
|
|
|
foreach $file ( @filesToRun ) {
|
|
my $fileText = read_file("$file", binmode => 'utf8');
|
|
while ($fileText =~ /text="([^><\n"]*)">([^><\n"]*)<\/w>/g) {
|
|
my ($text, $raw) = ($1, $2);
|
|
$raw =~ s/[^\w]+$//;
|
|
$mapTo{$text} .= "$raw, ";
|
|
# say "$1: $mapTo{$1}"
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
sub Output {
|
|
$" = ", ";
|
|
open(OUT, ">Output/map.tab") or die "$!";
|
|
foreach my $text (sort keys %mapTo) {
|
|
#say "$text: $mapTo{$text}";
|
|
my @array = split/, /, $mapTo{$text};
|
|
#say "@array";
|
|
#my @pruned = keys {map {$_ => 1} @array};
|
|
my @pruned = uniq @array;
|
|
say OUT $text . "\t" . "@pruned"
|
|
}
|
|
close OUT;
|
|
}
|