#!/usr/bin/perl -w
# Based on mitglieder-wb.pl: 10/2001: Taal en Spraak Materiaal
# converter: XYZA -> X AY AZ A and so on
# Usage: ./MateriaalTaal.pl MateriaalTaal.html
#
# This first part of the script
#   1. reads one line of "sentence.case=value" pairs per subject,
#   2. marks every sentence with incomplete data as "n/a" for that subject,
#   3. dumps the raw table to spss-taal.txt,
#   4. normalizes each subject's values to mean 0 / standard deviation 1,
#   5. computes mean, SD, sample count and sum of squares per sentence/case.
#
# NOTE(review): `use strict` is deliberately not enabled here; switch it on
# only once every section of the script declares all of its variables.

my $nr = 1;          # 1-based number of the subject currently being read
my %sentences;       # all data; keys are dotted index strings:
                     #   "subject.sentence.case"      -> single value
                     #   "sentence.case.num|sqsum|sd|mean" -> statistics
my %users;           # declared by the original script, not used below
my $NONE = 99;       # value for "n/a"
my $maxsent = 0;     # highest sentence number seen (later: that plus one)

# True when $value holds real data, i.e. it is defined and is not the
# $NONE marker (compared with a tolerance of 0.5).
sub is_usable {
    my ($value) = @_;
    return defined($value) && abs($value - $NONE) > 0.5;
}

############################################################
# Read the input file.
# NOTE(review): in the reviewed copy the text between `open(LOGFILE,"` and
# `) { chomp;` was missing (it looks like it was stripped as markup).  The
# open of $ARGV[0] and the read loop below are a reconstruction based on
# the usage line above; the wording of the open error message is new.

my $input_name = $ARGV[0];
defined $input_name
    or die "Usage: $0 MateriaalTaal.html\n";
open(my $log_fh, '<', $input_name)
    or die "$input_name read error: $!\n";

while (my $line = <$log_fh>) {
    chomp $line;
    $line =~ tr/&/;/;                 # accept '&' as well as ';' separators
    next if $line !~ /=/;             # no data on this line (e.g. a date)

    for my $part (split /;/, $line) {
        my ($item, $value) = split /=/, $part;
        my ($sent_no, $case_no) =
            defined $item ? split(/\./, $item) : ();

        # A missing case number is rejected the same way as one that is
        # out of range.
        if (!defined $case_no || $case_no < 1 || $case_no > 3) {
            die "Ouch... case not 1..3\n";
        }
        if ($sent_no > $maxsent) {
            $maxsent = $sent_no;
        }
        $sentences{"$nr.$item"} = $value;   # store value for one item
    }
    $nr = $nr + 1;                    # every data line is one subject
}
close($log_fh);

my $subj = $nr;                       # number of subjects plus one
$maxsent = $maxsent + 1;              # number of sentences plus one

############################################################
$nr = 1;
print "Killing partial data\n";

for my $s (1 .. $subj - 1) {                 # for all subjects
    for my $sent (1 .. $maxsent - 1) {       # for all sentences
        my @case_keys = map { "$s.$sent.$_" } 1 .. 3;

        # If any of the three cases is missing, the whole sentence is
        # marked n/a for this subject.
        if (grep { !defined $sentences{$_} } @case_keys) {
            @sentences{@case_keys} = ($NONE) x 3;
        }
    }
}

############################################################
print "Storing raw data in spss-taal.txt\n";
open(my $spss_fh, '>', 'spss-taal.txt')
    or die "spss-taal.txt write error\n";

for my $s (1 .. $subj - 1) {                 # one output row per subject
    for my $sent (1 .. $maxsent - 1) {
        for my $case (1 .. 3) {
            print {$spss_fh} $sentences{"$s.$sent.$case"} . " ";
        }
    }
    print {$spss_fh} "\r\n";
}
# Buffered write errors only show up at close time, so check it.
close($spss_fh)
    or die "spss-taal.txt write error\n";

############################################################
############################################################
# next, we normalize the data for each subject

$nr = 1;
my $sum = 0;
my $cnt = 0;
my $adj = 0;

for my $s (1 .. $subj - 1) {                 # for all subjects
    print "Normalizing Subject $s ... ";

    # Keys of all usable values of this subject, in sentence/case order.
    # The list is built once so that a value cannot change its n/a status
    # while it is being shifted and scaled.
    my @keys;
    for my $sent (1 .. $maxsent - 1) {
        for my $case (1 .. 3) {
            my $key = "$s.$sent.$case";
            push @keys, $key if is_usable($sentences{$key});
        }
    }

    $sum = 0.0;
    $cnt = 0;
    $adj = 0.0;
    for my $key (@keys) {
        $sum = $sum + $sentences{$key};
        $cnt = $cnt + 1;
    }
    if ($cnt == 0) {
        die "Div by zero in average\n";
    }
    $adj = 0.0 - ($sum / $cnt);       # subtract average to make it 0
    printf "Sum %3.3f in %3.3f, Adjust: add %3.3f\n", $sum, $cnt, $adj;

    # Shift to mean 0 and sum the squares of the shifted values.
    $sum = 0;
    for my $key (@keys) {
        $sentences{$key} = $sentences{$key} + $adj;
        $sum = $sum + ($sentences{$key} * $sentences{$key});
    }
    if (($cnt == 0) || ($sum == 0)) {
        die "Div by zero in variance\n";
    }
    $sum = $sum / (0.0 + $cnt);
    printf "Variance is %5.5f ", $sum;
    $sum = sqrt($sum);
    $adj = 1.0 / $sum;                # divide by std dev to make std dev 1.0
    printf "Standard Deviation is %5.5f, Adjust: multiply by %5.5f\n",
        $sum, $adj;

    for my $key (@keys) {
        $sentences{$key} = $sentences{$key} * $adj;
    }
}

############################################################
############################################################
# next, we combine all subjects to get the mean and sd for
# every part of every sentence
# NEW: plus sum of squared... and number of subjects for each

print "Calculating mean and standard deviation per sentence and part\n";

for my $sent (1 .. $maxsent - 1) {           # for all sentences
    for my $case (1 .. 3) {                  # for all cases
        my $part = "$sent.$case";

        # Usable values of all subjects for this sentence and case.
        my @values;
        for my $s (1 .. $subj - 1) {
            my $value = $sentences{"$s.$sent.$case"};
            push @values, $value if is_usable($value);
        }

        my $count = scalar @values;
        my $mean  = 0.0;
        my $sd    = 0.0;

        $sentences{"$part.num"} = $count;    # NEW: save count
        if ($count == 0) {
            print "No data available on sentence.part $part\n";
            $sentences{"$part.sqsum"} = 0;   # NEW: (save sum of squared...)
        }
        else {
            for my $value (@values) {
                $mean += $value;
            }
            $mean /= $count;

            # we have the mean among all subjects for this sentence and
            # part, now calculate SD for it
            for my $value (@values) {
                $sd += (($value - $mean) * ($value - $mean));
            }
            $sentences{"$part.sqsum"} = $sd; # NEW: save sum of squared...
            $sd /= $count;                   # now sd contains variance
            $sd = sqrt($sd);                 # now sd contains std dev
        }
        $sentences{"$part.sd"}   = $sd;      # store 0.0 or SD
        $sentences{"$part.mean"} = $mean;    # store 0.0 or mean

        printf "sent.part %s: mean %5.5f std dev: %5.5f "
             . "(%5.5d samples)\n", $part, $mean, $sd, $count;
    }
}

############################################################
# calculate global mean, MSwithin, MSbetween, and F value
# for each sentence
# see source to see the formula behind this!
# For every sentence: combine the three per-case means, counts and sums of
# squares stored in %sentences into a global mean, MSB, MSW and an F value,
# store them under "<sentence>.mean|msb|msw|fval" and print one line each.
for my $sent (1 .. $maxsent - 1) {           # for all sentences
    my @case_mean = map { $sentences{"$sent.$_.mean"} } 1 .. 3;

    # sent.case.num is supposed to be the same for all
    # values of case... (we filtered this at the beginning).
    my $num = $sentences{"$sent.1.num"};

    my $globalmean =
        ($case_mean[0] + $case_mean[1] + $case_mean[2]) / 3.0;
    $sentences{"$sent.mean"} = $globalmean;

    my $msb = $num * (
          (($case_mean[0] - $globalmean) ** 2)
        + (($case_mean[1] - $globalmean) ** 2)
        + (($case_mean[2] - $globalmean) ** 2)
    );
    $sentences{"$sent.msb"} = $msb;
    # MSB is thus roughly the variance of means

    my $msw = $sentences{"$sent.1.sqsum"}
            + $sentences{"$sent.2.sqsum"}
            + $sentences{"$sent.3.sqsum"};
    $sentences{"$sent.msw"} = $msw;          # stored before the clamp below
    # MSW is kind of a sum of variances

    if (abs($msw) < 0.000000001) {
        print "MSW found to be zero for sentence " . $sent
            . ", assuming 0.000000001\n";
        $msw = 0.000000001;
    }

    # 3: number of cases    sent.case.num: same for all cases
    # so 3-1 is df_1 and num-3 is df_2 (both values are
    # important for selection of F_critical).
    # NOTE(review): num is the sample count of ONE case; if df_2 is meant
    # to be "total observations minus number of groups" it would be
    # 3*num-3 instead -- confirm against the formula source before changing.
    my $df_1 = 3 - 1;
    my $df_2 = $num - 3;

    my $fval;
    if ($df_2 > 0) {
        $fval = ($msb / $df_1) / ($msw / $df_2);
    }
    else {
        # With three or fewer samples df_2 is zero or negative: the
        # division would abort the script (df_2 == 0) or give a
        # meaningless negative F, so report it and store 0 instead.
        print "Too few samples for sentence $sent (df_2 = $df_2), "
            . "F set to 0\n";
        $fval = 0;
    }
    $sentences{"$sent.fval"} = $fval;
    # the famous F value, see below for explanation

    printf "sent: %3.3d F: %5.5f mean: %5.5f in: %3.3d MSB: %5.5f "
         . " MSW: %5.5f\n",
        $sent, $fval, $globalmean, $num, $msb, $msw;
}

# F value: "a measure how different the means are relative to
#           the variability within each sample"
#          "the greater this value, the greater the likelihood that
#           differences between the means are due to something other
#           than chance alone"
# http://www.psychstat.smsu.edu/introbook/sbk27m.htm (-> SPSS...)
# 1.0 would mean no effect at all,
# F greater than a critical value -> significant effect,
# critical value is based on number of groups and subjects...
# usual value for alpha: 0.05 or even 0.01 ... affects Fcrit...
############################################################
# now we can print the data sorted by F value (ascending)!
# One line per sentence goes to the console and one to
# measures-taal.txt.
# NOTE(review): ascending order makes the sentence with the SMALLEST F the
# "1. choice" -- confirm that this is the intended ranking.

my @sorted;
my %tosort;

print "Storing the essence of our calculations in measures-taal.txt\n";
open(my $meas_fh, '>', 'measures-taal.txt')
    or die "measures-taal.txt write error\n";

# sort by F-VALUE
for my $sent (1 .. $maxsent - 1) {           # for all sentences
    $tosort{$sent} = $sentences{"$sent.fval"};
}

# sort by value using an inline function - see man perlfunc
# <=> is num, cmp is alph; equal F values are ordered by sentence number
# so that the output does not depend on hash ordering.
@sorted = sort { $tosort{$a} <=> $tosort{$b} or $a <=> $b } keys %tosort;

my $rank = 0;
for my $sel (@sorted) {                      # for all sentences, by F
    $rank++;

    # df_1 is 3-1, 3 being the number of cases; df_2 is num-3.  The same
    # df_2 is used for the console line and for the file (the file used
    # to receive the raw sample count under the label "df2").
    my $df_2 = $sentences{"$sel.1.num"} - 3;

    printf "%3.3d. choice: sent: %3.3d F: %5.5f df_1: 2 df_2: %3d\n",
        $rank, $sel, $sentences{"$sel.fval"}, $df_2;

    printf {$meas_fh}
        "S=%3.3d F=%3.3f df1=2 df2=%3.3d MSB=%3.3f MSW=%3.3f"
      . " M1=%3.3f M2=%3.3f M3=%3.3f SD1=%3.3f SD2=%3.3f SD3=%3.3f\n",
        $sel,
        $sentences{"$sel.fval"},
        $df_2,
        $sentences{"$sel.msb"},
        $sentences{"$sel.msw"},
        $sentences{"$sel.1.mean"},
        $sentences{"$sel.2.mean"},
        $sentences{"$sel.3.mean"},
        $sentences{"$sel.1.sd"},
        $sentences{"$sel.2.sd"},
        $sentences{"$sel.3.sd"};
}

# Buffered write errors only show up at close time, so check it.
close($meas_fh)
    or die "measures-taal.txt write error\n";

exit;