#!/usr/bin/perl -w use strict; my(%onsets, $corpus, $count, $limit, $onset, $size, @sizes, %consonants, $sonority, @risers, @fallers); #define sonority classes my @obstruents = qw(b d g p t k c j f v s z x); my @nasals = qw(m n); my @liquids = qw(r l); my @glides = qw(w y h); #add sonority classes to %consonants hash foreach my $c (@obstruents) { $consonants{$c} = 1; } foreach my $c (@nasals) { $consonants{$c} = 2; } foreach my $c (@liquids) { $consonants{$c} = 3; } foreach my $c (@glides) { $consonants{$c} = 4; } #initialize sonority counts for (my $i = 0; $i < 5; $i++) { $risers[$i] = 0; $fallers[$i] = 0; } #get command-line arguments if (@ARGV == 1) { $corpus = $ARGV[0]; $limit = 0; } elsif (@ARGV == 2) { $corpus = $ARGV[0]; $limit = $ARGV[1]; } else { die "Usage: perl cluster.pl corpus-file (word-limit)\n"; } #number of words parsed $count = 0; #open file open F, $corpus or die "Can't open $corpus\n"; #read the corpus line by line while (my $line = ) { chomp $line; #eliminate possessive apostrophe $line =~ s/(\'s)(\W)/s$2/g; #eliminate line-final punctuation $line =~ s/^(.*)\W$/$1/; #split the line into words my @words = split /[\s\-]+/, $line; #go through each word foreach my $word (@words) { #skip numbers next if ($word =~ /\d/); #skip abbreviations next if ($word =~ /\./); #skip items with asterisks next if ($word =~ /\*/); #skip rbi next if ($word =~ /^rb/); #eliminate remaining punctuation $word =~ s/\W//g; #normalize onset letters $onset = lc $word; #accommodate initial if ($onset =~ /^y/) { $onset = "y"; } else { $onset =~ s/^([^yaeiouYAEIOU]*)[yaeiouYAEIOU].*$/$1/; } #these simplify clusters in various ways $onset =~ s/hh+/h/; $onset =~ s/^wr$/r/; $onset =~ s/^tt$/t/; $onset =~ s/s[zs]+/s/g; $onset =~ s/(.)c?h/$1/g; $onset =~ s/t[scg]/t/; $onset =~ s/q/kw/; $onset =~ s/ff+/f/; $onset =~ s/nn+/n/; $onset =~ s/mm+/m/; $onset =~ s/pp+/p/; $onset =~ s/ll+/l/; $onset =~ s/^p([stn])$/$1/; #record the count $onsets{$onset}++; } $count += @words; #exit if the word-limit has been exceeded last if $count > $limit; } #close file close F; #print the total number of words print "Count = $count\n"; #collect counts for different onset sizes foreach my $key (sort keys %onsets) { $size = length $key; if (sonorityRise($key)) { $risers[$size] += $onsets{$key}; } else { $fallers[$size] += $onsets{$key}; } $sizes[$size] += $onsets{$key}; } #print out counts for different onset sizes print "size of onset: frequency\n"; for ($size = 0; $size < @sizes; $size++) { print "$size: $sizes[$size]\n"; } #print out counts for sonority print "\nsonority results\n"; for (my $i = 0; $i < 5; $i++) { print "$i\n"; print "\tRising: $risers[$i]\n"; print "\tFalling: $fallers[$i]\n"; } #determines whether a cluster rises in sonority (1) or not (0) sub sonorityRise { my $onset = shift; my $lgth = length $onset; return 1 if $lgth < 2; my $cur = 0; for (my $i = 0; $i < $lgth; $i++) { return 0 if $cur >= $consonants{substr($onset, $i, 1)}; $cur = $consonants{substr($onset, $i, 1)}; } return 1; } =head1 NAME cluster - onset cluster statistics in Perl =head1 COMMAND-LINE SYNOPSIS perl cluster.pl corpus-file (word-limit) =head1 DESCRIPTION This program allows the user to compute various statistics about onset clusters from an orthographic text file, including the relative frequency or different cluster sizes and the relative frequency of clusters that satisfy the sonority hierarchy. The program assumes each sentence is on a separate line. =head1 VERSION 1.0 (Mar. 2007) =head1 AUTHOR Michael Hammond, F =cut