en_ulb_tagged/Extract_Concordance.pl

58 lines
1.2 KiB
Perl

# Extracts all mappings from Greek text of manually tagged ULB
use 5.18.0;
use File::Slurp;
use File::Find ;
use List::MoreUtils qw(uniq);;
use Cwd ;
use utf8;
#use open IN => ":utf8", OUT => ":utf8";
use open IO => ":utf8";
open(LOG, ">Logs/Log.txt") or die "$!";
my $topDir = "/Users/Henry/Documents/WACS/en_ulb_tagged/Manual_Tagging";
my %mapTo;
my @filesToRun = ();
my $filePattern = '\.xml' ;
my $file;
find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ;
ReadFiles();
Output();
close LOG;
say "\nDone.";
# =====
sub ReadFiles {
foreach $file ( @filesToRun ) {
my $fileText = read_file("$file", binmode => 'utf8');
while ($fileText =~ /text="([^><\n"]*)">([^><\n"]*)<\/w>/g) {
my ($text, $raw) = ($1, $2);
$raw =~ s/[^\w]+$//;
$mapTo{$text} .= "$raw, ";
# say "$1: $mapTo{$1}"
}
}
}
sub Output {
$" = ", ";
open(OUT, ">Output/map.tab") or die "$!";
foreach my $text (sort keys %mapTo) {
#say "$text: $mapTo{$text}";
my @array = split/, /, $mapTo{$text};
#say "@array";
#my @pruned = keys {map {$_ => 1} @array};
my @pruned = uniq @array;
say OUT $text . "\t" . "@pruned"
}
close OUT;
}