# Extracts all mappings from Greek text of manually tagged ULB use 5.18.0; use File::Slurp; use File::Find ; use List::MoreUtils qw(uniq);; use Cwd ; use utf8; #use open IN => ":utf8", OUT => ":utf8"; use open IO => ":utf8"; open(LOG, ">Logs/Log.txt") or die "$!"; my $topDir = "/Users/Henry/Documents/WACS/en_ulb_tagged/Manual_Tagging"; my %mapTo; my @filesToRun = (); my $filePattern = '\.xml' ; my $file; find( sub { push @filesToRun, $File::Find::name if ( m/^(.*)$filePattern$/ ) }, $topDir) ; ReadFiles(); Output(); close LOG; say "\nDone."; # ===== sub ReadFiles { foreach $file ( @filesToRun ) { my $fileText = read_file("$file", binmode => 'utf8'); while ($fileText =~ /text="([^><\n"]*)">([^><\n"]*)<\/w>/g) { my ($text, $raw) = ($1, $2); $raw =~ s/[^\w]+$//; $mapTo{$text} .= "$raw, "; # say "$1: $mapTo{$1}" } } } sub Output { $" = ", "; open(OUT, ">Output/map.tab") or die "$!"; foreach my $text (sort keys %mapTo) { #say "$text: $mapTo{$text}"; my @array = split/, /, $mapTo{$text}; #say "@array"; #my @pruned = keys {map {$_ => 1} @array}; my @pruned = uniq @array; say OUT $text . "\t" . "@pruned" } close OUT; }