Merge tag 'herisvm-0.8.1' into dgit/sid

author: Andrew Shadura <andrewsh@debian.org> 2018-03-27 09:00:23 +0200
committer: Andrew Shadura <andrewsh@debian.org> 2018-03-27 09:00:23 +0200
commit: 3a7a7cb2ec4965de7a7125ff3fbcb19a75847fcd (patch)
tree: 5ba8acf931b8d803b31e7d6b4ff9ddbc3dbbbabb
parent: dcbd9ccf5345ff85848df806a50ac4b2c08aaa5f (diff)
parent: 9d064c7bcd947481e225c9693463e28fa281e1c6 (diff)
13 files changed, 285 insertions, 119 deletions
diff --git a/doc/NEWS b/doc/NEWS
index db74a4c..f20870a 100644
--- a/doc/NEWS
+++ b/doc/NEWS
@@ -1,3 +1,9 @@
 ======================================================================
+Version 0.8.1, Thu, 15 Mar 2018 15:28:46 +0300
+
+   heri-stat-addons: format string for max_dev was fixed
+
+======================================================================
 Version 0.1.0, Sat, 13 Jun 2015 12:53:02 +0300
-  initial publicly available release
+
+   initial publicly available release
diff --git a/doc/TODO b/doc/TODO
index 7c9dbaa..21da261 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,4 +1,6 @@
 * heri-eval:
+  - heri-eval -T: target class
   - Repeated random sub-sampling
     heri-eval -t 10 -r 60 ...
   - Alternative formats (crfsuite) for heri-split
+  - Support for IE (no classes, just information extraction)
diff --git a/scripts/heri-eval b/scripts/heri-eval
index be1ed8a..1a26193 100755
--- a/scripts/heri-eval
+++ b/scripts/heri-eval
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com>
-# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net>
+# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net>
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -25,6 +25,11 @@
 # variables settable by user
 : ${SVM_TRAIN_CMD:=svm-train}
 : ${SVM_PREDICT_CMD:=svm-predict}
+
+: ${SVM_HERI_STAT_CMD:=heri-stat}
+: ${SVM_HERI_STAT_ADDONS_CMD:=heri-stat-addons}
+: ${SVM_HERI_SPLIT_CMD:=heri-split}
+
 : ${TMPDIR:=/tmp}
 
 ############################################################
@@ -52,6 +57,30 @@ on_exit(){
     fi
 }
 
+calculate_feature_count (){
+    awk '{
+	for (i=2; i <= NF; ++i) {
+	    if ($i + 0 > m)
+		m = $i + 0
+	}
+    }
+    END {
+	print m+1
+    }' "$@"
+}
+
+calculate_feature_count (){
+    awk '{
+	for (i=2; i <= NF; ++i) {
+	    if ($i + 0 > m)
+		m = $i + 0
+	}
+    }
+    END {
+	print m+1
+    }' "$@"
+}
+
 results_from_testing_sets (){
     if ! test -s "$tmp_dir/testing_fold.txt"; then
 	cat "$tmp_dir/result_single1.txt"
@@ -128,38 +157,37 @@ usage(){
 usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS]
 Examples:
     heri-eval -n5 dataset.libsvm                # 5-fold cross-validation
-    heri-eval -e testing.libsvm dataset.libsvm  # testing on testing.libsvm
+    heri-eval -e testing.libsvm training.libsvm  # testing on testing.libsvm
 
 OPTIONS:
-      -h                   help message
+      -h                   Help message
 
-      -n N                 N-fold cross validation mode (mandatory option)
-      -t T                 T*N-fold cross validation mode (1 by default)
+      -n N                 The number of folds for T*N-fold cross-validation
+      -t T                 The number of runs  for T*N-fold cross-validation
 
-      -e testing_set       testing set for hold-out method
+      -e testing_set       Testing set for hold-out
 
-      -o <filename>        save results from testing sets
+      -o <filename>        Save results from testing sets
                            to the specified file
                            (golden_tag result_tag [score])
-      -O <filename>        save incorrectly classified objects
+      -O <filename>        Save incorrectly classified objects
                            to the specified file
                            (#object_number: golden_tag result_tag [score])
-      -m <filename>        save confusion matrix to the specified file
+      -m <filename>        Save confusion matrix to the specified file
                            (frequency : golden_tag result_tag)
 
       -f                   Enable output of per-fold statistics (see -Mf)
-      -M <chars>           output mode:
+      -M <chars>           Output mode:
                               t -- output total statistics,
                               f -- output per-fold statistics,
                               c -- output cross-fold statistics.
-                           The default is "-M tc".
-      -p <stat_opts>       options passed to heri-stat(1)
-      -S <seed>            seed pseudo-random generator used for splitting
-                           dataset into traing and testing parts.
-                           The default is empty, which means
-                           'split dataset randomly every invocation'
-      -K                   keep temporary directory after exiting
-      -D                   debugging mode, implies -K
+      -s <split_opts>      Options passed to heri-split(1)
+      -p <stat_opts>       Options passed to heri-stat(1)
+      -S <seed>            Seed value passed to heri-split(1).
+                           If it is not specified, the dataset is splitted
+                           into training and testing datasets randomly.
+      -K                   Keep temporary directory after exiting
+      -D                   Debugging mode, implies -K
 
 SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike
 
@@ -181,7 +209,7 @@ EOF
 runs=1
 output_mode=tc
 times=1
-while getopts De:fhKm:M:n:o:O:p:S:t: f; do
+while getopts De:fhKm:M:n:o:O:p:s:S:t: f; do
     case "$f" in
 	'?')
 	    usage
@@ -201,6 +229,8 @@ while getopts De:fhKm:M:n:o:O:p:S:t: f; do
 	    results="$OPTARG";;
 	O)
 	    incorrect_results="$OPTARG";;
+	s)
+	    herisplit_args="$herisplit_args $OPTARG";;
 	p)
 	    heristat_args="$heristat_args $OPTARG";;
 	f)
@@ -247,7 +277,7 @@ tmp_dir=`mktemp -d $TMPDIR/svm.XXXXXX`
 
 training_testing (){
     if test -n "$number_of_folds"; then
-	heri-split -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files
+	${SVM_HERI_SPLIT_CMD} $herisplit_args -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files
 	if test -n "$seed"; then
 	    seed="${seed}9876"
 	fi
@@ -287,12 +317,13 @@ show_stat (){
 	    awk '{print $1}' "$tmp_dir/test${t}_$i.txt" > "$tmp_dir/golden_tags${t}_${i}"
 	    if [[ "_$output_mode" =~ f ]]; then
 		echo "Fold ${t}x$i statistics"
-		heri-stat $heristat_args \
+		${SVM_HERI_STAT_CMD} $heristat_args \
 		    "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" |
 		indent2
 		echo ''
 	    fi
-	    heri-stat -R "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \
+	    ${SVM_HERI_STAT_CMD} -R \
+		"$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \
 		> "$tmp_dir/evaluation${t}_${i}.txt"
 	    paste "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | \
 		tr '	' ' '  > "$tmp_dir/result_single${t}_${i}.txt"
@@ -303,6 +334,8 @@ show_stat (){
     done
 }
 
+export HERISVM_FC=`calculate_feature_count $files`
+
 for t in `seq $times`; do
     training_testing "$@"
 #    ls -l "$tmp_dir/"
@@ -341,11 +374,11 @@ fi
 #
 if [[ "_$output_mode" =~ t ]]; then
     echo 'Total statistics'
-    heri-stat -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2
+    ${SVM_HERI_STAT_CMD} -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2
     echo ''
 fi
 
 if test -n "$number_of_folds" && [[ "_$output_mode" =~ c ]]; then
     echo 'Total cross-folds statistics'
-    heri-stat-addons "$tmp_dir"/evaluation*.txt | indent2
+    ${SVM_HERI_STAT_ADDONS_CMD} "$tmp_dir"/evaluation*.txt | indent2
 fi
diff --git a/scripts/heri-eval.pod b/scripts/heri-eval.pod
index 46575f3..222e61c 100644
--- a/scripts/heri-eval.pod
+++ b/scripts/heri-eval.pod
@@ -28,15 +28,16 @@ Enable output of per-fold statistics. See B<-M>I<f>.
 
 =item B<-n> I<N>
 
-I<N>-fold cross validation mode (mandatory option).
+Enable T*I<N>-fold cross-validation mode and set the number of folds to I<N>.
 
 =item B<-t> I<T>
 
-I<T>*N-fold cross validation mode (1 by default).
+Enable I<T>*N-fold cross-validation mode and set the number of runs to I<T>
+which is 1 by default.
 
-=item B<-e> I<testing set>
+=item B<-e> I<testing_dataset>
 
-Sets the testing dataset.
+Enable hold-out mode and set the testing dataset.
 
 =item B<-o> I<filename>
 
@@ -60,7 +61,11 @@ Format: frequency : golden_class result_class
 
 =item B<-p> I<opts>
 
-Pass the specified I<opts> to B<heri-stat(1)>
+Pass the specified I<opts> to B<heri-stat(1)>.
+
+=item B<-s> I<opts>
+
+Pass the specified I<opts> to B<heri-split(1)>.
 
 =item B<-M> I<chars>
 
@@ -98,6 +103,18 @@ Training utility, e.g., liblinear-train
 Predicting utility, e.g., liblinear-predict
 (the default is svm-predict).
 
+=item I<SVM_HERI_STAT_CMD>
+
+Utility for calculating statistics (the default is B<heri-stat(1)>).
+
+=item I<SVM_HERI_STAT_ADDONS_CMD>
+
+Utility for calculating additional statistics (the default is B<heri-stat-addons(1)>).
+
+=item I<SVM_HERI_SPLIT_CMD>
+
+Utility for splitting the dataset (the default is B<heri-split(1)>).
+
 =item I<TMPDIR>
 
 Temporary directory (the default is /tmp).
diff --git a/scripts/heri-split b/scripts/heri-split
index 0c4381b..91b1c77 100755
--- a/scripts/heri-split
+++ b/scripts/heri-split
@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
 
 # Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com>
-# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net>
+# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net>
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -24,10 +24,11 @@
 
 require 'optparse'
 
-options = {}
-fold_cnt = nil
-tmp_dir = nil
-seed = Random.new_seed
+$options = {}
+$fold_cnt = nil
+$tmp_dir = nil
+$seed = Random.new_seed
+$stratified = true
 
 OptionParser.new do |opts|
   opts.banner = <<EOF
@@ -44,94 +45,134 @@ EOF
   end
 
   opts.on("-cFOLD_CNT", "--folds=FOLD_CNT", "A number if folds (mandatory option)") do |c|
-    fold_cnt = c.to_i
+    $fold_cnt = c.to_i
   end
 
   opts.on("-dDIR", "--output-dir=DIR", "Output directory (mandatory option)") do |d|
-    tmp_dir = d
+    $tmp_dir = d
   end
 
   opts.on("-sSEED", "--seed=SEED", "Seed for pseudo-random number generator") do |s|
     if s != "" then
-      seed = s.to_i
+      $seed = s.to_i
     end
   end
 
+  opts.on("-r", "--random", "Use random split instead of stratified") do
+    $stratified = false
+  end
+
   opts.separator " "
 end.parse!
 
 
-if tmp_dir == nil or fold_cnt == nil then
+if $tmp_dir == nil or $fold_cnt == nil then
   STDERR.puts("Options -c and -d are mandatory, see heri-split -h for details")
   exit(1)
- end
-
-rnd = Random.new(seed)
+end
 
-#############################################
-#### to create ranges
+$rnd = Random.new($seed)
 
-files_test = []
-files_train = []
-testing_fold =  File.open(tmp_dir+"/testing_fold.txt", 'w')
-(1..fold_cnt).each do |i|
+# same as in StratifiedSplitter
+$files_test = []
+$files_train = []
+$testing_fold =  File.open($tmp_dir+"/testing_fold.txt", 'w:ASCII-8BIT')
+(1..$fold_cnt).each do |i|
   name_train = "train" + "#{i.to_i}"
   name_test = "test" + "#{i.to_i}"
-  files_test << File.open(tmp_dir+"/"+name_test+".txt", 'w')
-  files_train << File.open(tmp_dir+ "/"+ name_train+".txt", 'w')
+  $files_test << File.open($tmp_dir+"/"+name_test+".txt", 'w:ASCII-8BIT')
+  $files_train << File.open($tmp_dir+ "/"+ name_train+".txt", 'w:ASCII-8BIT')
 end
 
-#############################################
-####
+def random_split()
+  nums = []
+  curr_number = 0
+  ARGV.each do |fn|
+    File.open(fn, "r:ASCII-8BIT").each_line do |line|
+      if line =~ /^([^\s]+)\s/
+        nums << curr_number % $fold_cnt
+        curr_number += 1
+      end
+    end
+  end
+
+  nums.shuffle!(random: $rnd)
+
+  curr_number = 0
+  ARGV.each do |fn|
+    File.open(fn, "r:ASCII-8BIT").each_line do |line|
+      if line =~ /^([^\s]+)\s/
+        fold_num = nums[curr_number]
+        $fold_cnt.times do |n|
+          if fold_num == n
+            $files_test[n].puts line
+            $testing_fold.puts n+1
+          else
+            $files_train[n].puts line
+          end
+        end
 
-classes = Hash.new(0)
-ARGV.each do |fn|
-  File.open(fn).each_line do |line|
-    if line =~ /^([^\s]+)\s/
-      classes[$1] += 1
+        curr_number += 1
+      end
     end
   end
 end
-classes_arr = {}
-classes.each do |x, y|
-  arr = []
-  i = 1
-  while i <= y
-    arr << i
-    i +=1
+
+def stratified_split()
+  classes = Hash.new(0)
+  ARGV.each do |fn|
+    File.open(fn, "r:ASCII-8BIT").each_line do |line|
+      if line =~ /^([^\s]+)\s/
+        classes[$1] += 1
+      end
+    end
   end
-  cnt = (( y / fold_cnt.to_f ) ).to_i
-  arr = arr.shuffle(random: rnd)
-  classes_arr [x] = {}
-  arr.each_index do |i|
-    fold_train = (i * fold_cnt.to_f / arr.size).to_i
-    classes_arr[x][arr[i]] = fold_train
+  classes_arr = {}
+  classes.each do |x, y|
+    arr = []
+    i = 1
+    while i <= y
+      arr << i
+      i +=1
+    end
+    cnt = (( y / $fold_cnt.to_f ) ).to_i
+    arr.shuffle!(random: $rnd)
+    classes_arr [x] = {}
+    arr.each_index do |i|
+      fold_train = (i * $fold_cnt.to_f / arr.size).to_i
+      classes_arr[x][arr[i]] = fold_train
+    end
   end
-end
 
-num_line = Hash.new(0)
-ARGV.each do |fn|
-  File.open(fn).each_line do |line|
-    if line =~ /^([^\s]+)\s/
-      num_line[$1] += 1
-      curr_number = num_line[$1]
-      fold_cnt.times do |n|
-        if classes_arr[$1][curr_number] == n
-          files_test[classes_arr[$1][curr_number]].puts line
-          testing_fold.puts n+1
-        else
-          files_train[n].puts line
+  num_line = Hash.new(0)
+  ARGV.each do |fn|
+    File.open(fn, "r:ASCII-8BIT").each_line do |line|
+      if line =~ /^([^\s]+)\s/
+        num_line[$1] += 1
+        curr_number = num_line[$1]
+        $fold_cnt.times do |n|
+          if classes_arr[$1][curr_number] == n
+            $files_test[n].puts line
+            $testing_fold.puts n+1
+          else
+            $files_train[n].puts line
+          end
         end
       end
     end
   end
 end
 
+if $stratified
+  stratified_split()
+else
+  random_split()
+end
 
-files_test.each { |x|
+$files_test.each { |x|
   x.close
 }
-files_train.each { |x|
+$files_train.each { |x|
   x.close
 }
-testing_fold.close
+$testing_fold.close
diff --git a/scripts/heri-split.pod b/scripts/heri-split.pod
index f213f6b..b7e937e 100644
--- a/scripts/heri-split.pod
+++ b/scripts/heri-split.pod
@@ -10,7 +10,9 @@ B<heri-split> [OPTIONS] I<dataset1> [I<dataset2>...]
 
 B<heri-split> splits the dataset into several training and testing
 sets as it is required for N-fold cross-validation. Dataset contains
-one object per line as in svmlight/libsvm formats.
+one object per line as in svmlight format. By default
+stratified sampling is used. That is, all folds contain
+the same number of objects for each label (give or take one object).
 
 =head1 OPTIONS
 
@@ -22,15 +24,19 @@ Display help information.
 
 =item B<-c, --folds> I<count>
 
-Sets a number if folds. This is a mandatory option.
+Set the number of folds. This is a mandatory option.
 
 =item B<-d, --output-dir> I<dir>
 
-Sets the output directory. This is a mandatory option.
+Set the output directory. This is a mandatory option.
+
+=item B<-r, --random>
+
+Use random sampling instead of stratified sampling.
 
 =item B<-s, --seed> I<seed>
 
-Sets the seed value for pseudorandom generator.
+Set the seed value for pseudorandom generator.
 
 =back
 
diff --git a/scripts/heri-stat b/scripts/heri-stat
index ebaf700..ff7f819 100755
--- a/scripts/heri-stat
+++ b/scripts/heri-stat
@@ -68,7 +68,7 @@ def pretty_div(a, b)
 end
 
 def normalize_tag(tag)
-  tag = tag.sub(/^[+]/, "") # +1 => 1
+  tag = tag.to_s.sub(/^[+]/, "") # +1 => 1
   if tag =~ /^-?[0-9]+[.][0-9]+$/
     tag = tag.sub(/[.]0+$/, "") # -1.0000 => -1
   end
@@ -87,6 +87,7 @@ def split_into_3(line, fn)
   when 3
     ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), tokens[2].to_f]
   else
+    ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX]
     line.sub!(/^fake ?/, "")
     STDERR.puts("Bad line '#{line}' in file '#{fn}'")
     @err = 1
@@ -215,7 +216,7 @@ all_tp = 0
 all_f1 = 0
 res_tag2TP_cnt = tag2TP_cnt.sort_by { |key, value| key }
 res_tag2TP_cnt.each do |t, tp|
-  p = (tag2result_cnt[t] > 0.0  ?  tp.to_f / tag2result_cnt[t]  :  1.0)
+  p = (tag2result_cnt[t] > 0.0  ?  tp.to_f / tag2result_cnt[t]  :  0.0)
   r = (tag2golden_cnt[t] > 0.0  ?  tp.to_f / tag2golden_cnt[t]  :  0.0)
   f1 = (p+r > 0.0  ?  2*p*r / (p+r)  :  0.0)
   if !@options[:statistics]
diff --git a/scripts/heri-stat-addons b/scripts/heri-stat-addons
index 23e1617..6220931 100755
--- a/scripts/heri-stat-addons
+++ b/scripts/heri-stat-addons
@@ -120,7 +120,7 @@ FIELDS.each do |f, f_to_print|
   pairs.each do |ft|
     max_dev = max_deviation [ft[0]][ft[1]]
     std_dev = std_deviation [ft[0]][ft[1]]
-    max_dev = "%-5.3g%" % [max_dev*100]
+    max_dev = "%-5.3g" % [max_dev*100]
     std_dev = "%-5.3g" % [std_dev*100]
     print_value(ft[1], "max/std deviation(" + f_to_print + ")", max_dev, std_dev)
   end
diff --git a/scripts/heri-stat.pod b/scripts/heri-stat.pod
index 061784a..6f0e77b 100644
--- a/scripts/heri-stat.pod
+++ b/scripts/heri-stat.pod
@@ -1,7 +1,6 @@
 =head1 NAME
 
-heri-stat - calculates precision, recall, F1
-and some other things for given golden data and predictions.
+heri-stat - calculates precision, recall, F1 and some other things
 
 =head1 SYNOPSIS
 
@@ -51,7 +50,7 @@ Disable output of accuracy.
 
 =item B<-1, --single>
 
-2 or 3 tokens per line are expected on input
+2 or 3 tokens per line are expected on input.
 
 =item B<-u, --unclassified> I<label>
 
diff --git a/tests/dataset1.txt b/tests/dataset1.txt
index 76b8e4a..7dd27d0 100644
--- a/tests/dataset1.txt
+++ b/tests/dataset1.txt
@@ -1,9 +1,9 @@
-1 features1
-2 features2
-3 features3
-4 features4
-5 features5
-6 features6
-7 features7
-8 features8
-9 features9
+1 свойство1
+2 свойство2
+3 свойство3
+4 свойство4
+5 свойство5
+6 свойство6
+7 свойство7
+8 свойство8
+9 свойство9
diff --git a/tests/test_heri-eval.sh b/tests/test_heri-eval.sh
index c59d564..d1a1df1 100644
--- a/tests/test_heri-eval.sh
+++ b/tests/test_heri-eval.sh
@@ -292,9 +292,9 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
 cmp 'heri-eval #10.1 -- options' \
 'Total statistics
   Class  0      P, R, F1:  0.5797    40/69     ,  1         40/40     ,  0.7339
-  Class  1      P, R, F1:  1          0/0      ,  0          0/29     ,  0     
+  Class  1      P, R, F1:  0          0/0      ,  0          0/29     ,  0     
   Accuracy              :  0.5797    40/69     
-  Macro average P, R, F1:  0.7899              ,  0.5                 ,  0.367 
+  Macro average P, R, F1:  0.2899              ,  0.5                 ,  0.367 
 
 '
 
@@ -302,9 +302,20 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
     heri-eval -Mt -n2 matrix.libsvm -- -1 2>&1 |
 cmp 'heri-eval #10.2 -- options' \
 'Total statistics
-  Class  0      P, R, F1:  1          0/0      ,  0          0/40     ,  0     
+  Class  0      P, R, F1:  0          0/0      ,  0          0/40     ,  0     
   Class  1      P, R, F1:  0.4203    29/69     ,  1         29/29     ,  0.5918
   Accuracy              :  0.4203    29/69     
-  Macro average P, R, F1:  0.7101              ,  0.5                 ,  0.2959
+  Macro average P, R, F1:  0.2101              ,  0.5                 ,  0.2959
+
+'
+
+env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
+    heri-eval -Mt -s '-r' -S117 -n2 matrix.libsvm -- -1 2>&1 |
+cmp 'heri-eval #10.3 -- options' \
+'Total statistics
+  Class  0      P, R, F1:  0          0/0      ,  0          0/40     ,  0     
+  Class  1      P, R, F1:  0.4203    29/69     ,  1         29/29     ,  0.5918
+  Accuracy              :  0.4203    29/69     
+  Macro average P, R, F1:  0.2101              ,  0.5                 ,  0.2959
 
 '
diff --git a/tests/test_heri-split.sh b/tests/test_heri-split.sh
index 9d96560..0caf09b 100644
--- a/tests/test_heri-split.sh
+++ b/tests/test_heri-split.sh
@@ -1,4 +1,4 @@
-# heri-split -- no tests yet
+# -*- coding: utf-8 -*-
 
 dataset="$tmpdir/dataset"
 res_dir="$tmpdir/dir1"
@@ -34,6 +34,35 @@ exit status=1
 '
 
 generate_random_dataset
+
+rm -rf "$res_dir"/*
+
+{ heri-split -rc2 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } |
+cmp 'heri-split -r #2 exit code' \
+'exit status=0
+'
+
+ls -1 "$res_dir" | sort |
+cmp 'heri-split -r #3 result files' \
+'test1.txt
+test2.txt
+testing_fold.txt
+train1.txt
+train2.txt
+'
+
+for i in 1 2; do
+    { cat "$res_dir/test${i}.txt" "$res_dir/train${i}.txt" | sort -k3,3n; } |
+	cmp2 "heri-split -r #4.${i} all objects" \
+	     "$dataset"
+done
+
+{ cat "$res_dir/"test?.txt | sort -k3,3n; } |
+cmp2 "heri-split -r #5 testing sets correctness" \
+     "$dataset"
+
+rm -rf "$res_dir"/*
+
 { heri-split -c 3 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } |
 cmp 'heri-split #2 exit code' \
 'exit status=0
@@ -61,9 +90,18 @@ cmp2 "heri-split #5 testing sets correctness" \
      "$dataset"
 
 rm "$res_dir"/*
+heri-split -r -d "$res_dir" -c 4 dataset1.txt
+val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt`
+val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
+   sort -k1,1n -k2,2n |
+   awk '{print $2, $3}'`
+printf '%s' "$val1" | cmp "heri-split -r #6 correct testing_fold.txt" \
+     "$val2"
+
+rm "$res_dir"/*
 heri-split -d "$res_dir" -c 4 dataset1.txt
 val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt`
-val2=`awk '{printf "%d %d features%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
+val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
    sort -k1,1n -k2,2n |
    awk '{print $2, $3}'`
 printf '%s' "$val1" | cmp "heri-split #6 correct testing_fold.txt" \
@@ -74,7 +112,7 @@ heri-split -d "$res_dir" -c 9 dataset2.txt
 for i in 1 2 3 4 5 6 7 8 9; do
     wc -l "$res_dir/test$i.txt" | awk '{print $1}'
 done |
-cmp "heri-split #7 correct stratified splitting" \
+cmp "heri-split #7 correct stratified sampling" \
     '1
 1
 1
@@ -87,6 +125,18 @@ cmp "heri-split #7 correct stratified splitting" \
 '
 
 rm "$res_dir"/*
+heri-split -r -d "$res_dir" -c 4 dataset2.txt
+for i in 1 2 3 4; do
+    wc -l "$res_dir/test$i.txt" | awk '{print $1}'
+done |
+cmp "heri-split #7 correct random sampling" \
+    '3
+2
+2
+2
+'
+
+rm "$res_dir"/*
 heri-split -d "$res_dir" -c 2 dataset3.txt
 for j in 1 2; do
     echo "dataset: $j"
@@ -95,7 +145,7 @@ for j in 1 2; do
 	    END {printf("tag %s -> %s\n", tag, cnt)}' "$res_dir/test$j.txt"
     done
 done |
-cmp "heri-split #8 correct stratified splitting" \
+cmp "heri-split #8 correct stratified sampling" \
     'dataset: 1
 tag 1 -> 1
 tag 2 -> 1
@@ -118,7 +168,7 @@ for i in 0 1 2 3 4 5 6 7 8 9; do
 	    END {print cnt}' "$res_dir/train$j.txt"
     done | sort | awk '{ma = $1} NR == 1 {mi = $1} END {print ((ma - mi) <= 1)}'
 done |
-cmp "heri-split #9 correct stratified splitting" \
+cmp "heri-split #9 correct stratified sampling" \
 'tag: 0
 1
 tag: 1
diff --git a/tests/test_heri-stat.sh b/tests/test_heri-stat.sh
index 78f3c6c..d32b438 100644
--- a/tests/test_heri-stat.sh
+++ b/tests/test_heri-stat.sh
@@ -122,10 +122,10 @@ Macro average	f1	0.NNNN
 heri-stat golden3.txt result3.txt 2>&1 |
 remove_fractions |
 cmp 'heri-stat #8 all equal' \
-'Class  A      P, R, F1:  1          0/0      ,  0          0/6      ,  0     
+'Class  A      P, R, F1:  0          0/0      ,  0          0/6      ,  0     
 Class  B      P, R, F1:  0          0/6      ,  0          0/0      ,  0     
 Accuracy              :  0          0/6      
-Macro average P, R, F1:  0.NNNN                 ,  0                   ,  0     
+Macro average P, R, F1:  0                   ,  0                   ,  0     
 '
 
 heri-stat golden3.txt /dev/null 2>&1 |
@@ -201,10 +201,10 @@ Macro average P, R, F1:  1                   ,  0.75                ,  0.8439
 
 heri-stat -t 0.93 golden5.txt result5_prob.txt 2>&1 |
 cmp 'heri-stat #13.5 -t 0.93' \
-'Class  A      P, R, F1:  1          0/0      ,  0          0/2      ,  0     
-Class  B      P, R, F1:  1          0/0      ,  0          0/3      ,  0     
+'Class  A      P, R, F1:  0          0/0      ,  0          0/2      ,  0     
+Class  B      P, R, F1:  0          0/0      ,  0          0/3      ,  0     
 Class  C      P, R, F1:  1          3/3      ,  0.75       3/4      ,  0.8571
 Class  E      P, R, F1:  1          3/3      ,  0.5        3/6      ,  0.6667
 Micro average P, R, F1:  1          6/6      ,  0.4        6/15     ,  0.5714
-Macro average P, R, F1:  1                   ,  0.3125              ,  0.381 
+Macro average P, R, F1:  0.5                 ,  0.3125              ,  0.381 
 '
author	Andrew Shadura <andrewsh@debian.org>	2018-03-27 09:00:23 +0200
committer	Andrew Shadura <andrewsh@debian.org>	2018-03-27 09:00:23 +0200
commit	3a7a7cb2ec4965de7a7125ff3fbcb19a75847fcd (patch)
tree	5ba8acf931b8d803b31e7d6b4ff9ddbc3dbbbabb
parent	dcbd9ccf5345ff85848df806a50ac4b2c08aaa5f (diff)
parent	9d064c7bcd947481e225c9693463e28fa281e1c6 (diff)