#!/usr/bin/perl -w
use strict; # use this for every damn script you ever write
use warnings; # use this for every damn script you ever write
use Fcntl qw(:flock); # lets you lock files to prevent others from using them while they're open
# name of file that contains a word list (dictionary)
# the file should contain nothing but words, each in a separate line
#
# each sound must be represented by one letter
# this is crucially used in building the alphabet and in
# generating the neighbors
#
my $dictionaryFile = "citation.txt" ;
# name of file that contains a list of words to calculate the density for.
# each word in a separate line
my $inputFile = "stimuli.txt" ;
# name of output file
my $outputFile = "stimuli_density.txt";
# STEP 1:
# create lookup table (load the dictionary to the working memory)
# and also collect the letters of the alphabet
open (INPUT, "<" . $dictionaryFile) or die "Can't read dictionary file $!";
flock(INPUT, LOCK_SH) or die "$!"; # lock file for shared use
my %dict; # holds the attested words of the language
my %alphabet; # table of phonemes
while () {
chomp;
$dict{$_} = 1; # record the existence of a word
# collect any new phonemes in current word
for (my $i=0; $i < length($_); $i++) {
unless(exists ($alphabet{substr($_,$i,1)})) {
$alphabet{substr($_,$i,1)} = 1;
}
}
}
close(INPUT) or die "$!";
# STEP 2:
# get stimuli and calculate neighborhood density
open (INPUT, "<" . $inputFile) or die "Can't read input file $!";
open (OUTPUT, ">" . $outputFile) or die "Can't write to output file $!";
flock(INPUT, LOCK_EX) or die "$!"; # lock file for exclusive use
flock(OUTPUT,LOCK_EX) or die "$!"; # lock file for exclusive use
while () {
# at this point, each word of the input file at a time
# will be in the special variable $_
chomp; # remove the "end of line" character from the current word
# print an empty line in the output for any empty line in the output
if (/^$/ or /^#/) {
print OUTPUT "$_\n";
next;
}
my $formCounter = 0; # a count of potential neighbors
my @foundCounter; # list of attested neighbors
# replace each segment with all others
for (my $pos = 0 ; $pos < length($_) ; $pos++){
my $currSeg = substr($_,$pos,1); # saves the current segment in postion "$pos" for later
# go over all the alphabet
for my $phon (sort keys %alphabet) {
if ($phon ne $currSeg) {
substr($_,$pos,1,$phon); # insert current phoneme in place of original phoneme
$formCounter++;
push(@foundCounter,$_) if exists($dict{$_});
}
}
substr($_,$pos,1,$currSeg); # puts the original segment in "$pos" back where it belongs
}
# delete each segment
for (my $pos = 0 ; $pos < length($_) ; $pos++){
my $currSeg = substr($_,$pos,1); # saves the original segment in position "$pos" for later
substr($_,$pos,1,""); # delete seg
$formCounter++;
push(@foundCounter,$_) if exists($dict{$_});
substr($_,$pos,0,$currSeg); # puts the original segment in "$pos" back where it belongs
}
# add every possible segment in every position
for (my $pos = 0 ; $pos <= length($_) ; $pos++){
for my $phon (sort keys %alphabet) {
substr($_,$pos,0,$phon); # insert a phoneme at position "$pos"
$formCounter++;
push(@foundCounter,$_) if exists($dict{$_});
substr($_,$pos,1,""); # remove the inserted phoneme
}
}
# print result (whether or not the input word was found in the dictionary)
unless (scalar(@foundCounter)==0 or exists($dict{$_}) or exists($dict{$_."mak"}) or exists($dict{$_."mek"})) {
print OUTPUT "$_\t";
# print the frequency (four decimal places) and the actual neighbors
# \t = tab (useful for opening the file in excel as a "tab delimited file")
print OUTPUT " " if (scalar(@foundCounter)<10);
# print OUTPUT scalar(@foundCounter) ," / $formCounter = \t", int((scalar(@foundCounter) / $formCounter)*10000)/10000 , "\t@foundCounter\n";
print OUTPUT scalar(@foundCounter) ," / $formCounter \t@foundCounter\n";
}
}
close(INPUT) or die "$!";
close(OUTPUT) or die "$!";
print "Output file created.";