summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andrew Shadura <andrewsh@debian.org> 2018-03-27 09:00:23 +0200
committer Andrew Shadura <andrewsh@debian.org> 2018-03-27 09:00:23 +0200
commit 3a7a7cb2ec4965de7a7125ff3fbcb19a75847fcd (patch)
tree 5ba8acf931b8d803b31e7d6b4ff9ddbc3dbbbabb
parent dcbd9ccf5345ff85848df806a50ac4b2c08aaa5f (diff)
parent 9d064c7bcd947481e225c9693463e28fa281e1c6 (diff)
Merge tag 'herisvm-0.8.1' into dgit/sid
-rw-r--r-- doc/NEWS 8
-rw-r--r-- doc/TODO 2
-rwxr-xr-x scripts/heri-eval 81
-rw-r--r-- scripts/heri-eval.pod 27
-rwxr-xr-x scripts/heri-split 153
-rw-r--r-- scripts/heri-split.pod 14
-rwxr-xr-x scripts/heri-stat 5
-rwxr-xr-x scripts/heri-stat-addons 2
-rw-r--r-- scripts/heri-stat.pod 5
-rw-r--r-- tests/dataset1.txt 18
-rw-r--r-- tests/test_heri-eval.sh 19
-rw-r--r-- tests/test_heri-split.sh 60
-rw-r--r-- tests/test_heri-stat.sh 10
13 files changed, 285 insertions, 119 deletions
diff --git a/doc/NEWS b/doc/NEWS
index db74a4c..f20870a 100644
--- a/doc/NEWS
+++ b/doc/NEWS
@@ -1,3 +1,9 @@
======================================================================
+Version 0.8.1, Thu, 15 Mar 2018 15:28:46 +0300
+
+ heri-stat-addons: format string for max_dev was fixed
+
+======================================================================
Version 0.1.0, Sat, 13 Jun 2015 12:53:02 +0300
- initial publicly available release
+
+ initial publicly available release
diff --git a/doc/TODO b/doc/TODO
index 7c9dbaa..21da261 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,4 +1,6 @@
* heri-eval:
+ - heri-eval -T: target class
- Repeated random sub-sampling
heri-eval -t 10 -r 60 ...
- Alternative formats (crfsuite) for heri-split
+ - Support for IE (no classes, just information extraction)
diff --git a/scripts/heri-eval b/scripts/heri-eval
index be1ed8a..1a26193 100755
--- a/scripts/heri-eval
+++ b/scripts/heri-eval
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
# Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com>
-# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net>
+# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@@ -25,6 +25,11 @@
# variables settable by user
: ${SVM_TRAIN_CMD:=svm-train}
: ${SVM_PREDICT_CMD:=svm-predict}
+
+: ${SVM_HERI_STAT_CMD:=heri-stat}
+: ${SVM_HERI_STAT_ADDONS_CMD:=heri-stat-addons}
+: ${SVM_HERI_SPLIT_CMD:=heri-split}
+
: ${TMPDIR:=/tmp}
############################################################
@@ -52,6 +57,30 @@ on_exit(){
fi
}
+calculate_feature_count (){
+ awk '{
+ for (i=2; i <= NF; ++i) {
+ if ($i + 0 > m)
+ m = $i + 0
+ }
+ }
+ END {
+ print m+1
+ }' "$@"
+}
+
+calculate_feature_count (){
+ awk '{
+ for (i=2; i <= NF; ++i) {
+ if ($i + 0 > m)
+ m = $i + 0
+ }
+ }
+ END {
+ print m+1
+ }' "$@"
+}
+
results_from_testing_sets (){
if ! test -s "$tmp_dir/testing_fold.txt"; then
cat "$tmp_dir/result_single1.txt"
@@ -128,38 +157,37 @@ usage(){
usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS]
Examples:
heri-eval -n5 dataset.libsvm # 5-fold cross-validation
- heri-eval -e testing.libsvm dataset.libsvm # testing on testing.libsvm
+ heri-eval -e testing.libsvm training.libsvm # testing on testing.libsvm
OPTIONS:
- -h help message
+ -h Help message
- -n N N-fold cross validation mode (mandatory option)
- -t T T*N-fold cross validation mode (1 by default)
+ -n N The number of folds for T*N-fold cross-validation
+ -t T The number of runs for T*N-fold cross-validation
- -e testing_set testing set for hold-out method
+ -e testing_set Testing set for hold-out
- -o <filename> save results from testing sets
+ -o <filename> Save results from testing sets
to the specified file
(golden_tag result_tag [score])
- -O <filename> save incorrectly classified objects
+ -O <filename> Save incorrectly classified objects
to the specified file
(#object_number: golden_tag result_tag [score])
- -m <filename> save confusion matrix to the specified file
+ -m <filename> Save confusion matrix to the specified file
(frequency : golden_tag result_tag)
-f Enable output of per-fold statistics (see -Mf)
- -M <chars> output mode:
+ -M <chars> Output mode:
t -- output total statistics,
f -- output per-fold statistics,
c -- output cross-fold statistics.
- The default is "-M tc".
- -p <stat_opts> options passed to heri-stat(1)
- -S <seed> seed pseudo-random generator used for splitting
- dataset into traing and testing parts.
- The default is empty, which means
- 'split dataset randomly every invocation'
- -K keep temporary directory after exiting
- -D debugging mode, implies -K
+ -s <split_opts> Options passed to heri-split(1)
+ -p <stat_opts> Options passed to heri-stat(1)
+ -S <seed> Seed value passed to heri-split(1).
+ If it is not specified, the dataset is splitted
+ into training and testing datasets randomly.
+ -K Keep temporary directory after exiting
+ -D Debugging mode, implies -K
SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike
@@ -181,7 +209,7 @@ EOF
runs=1
output_mode=tc
times=1
-while getopts De:fhKm:M:n:o:O:p:S:t: f; do
+while getopts De:fhKm:M:n:o:O:p:s:S:t: f; do
case "$f" in
'?')
usage
@@ -201,6 +229,8 @@ while getopts De:fhKm:M:n:o:O:p:S:t: f; do
results="$OPTARG";;
O)
incorrect_results="$OPTARG";;
+ s)
+ herisplit_args="$herisplit_args $OPTARG";;
p)
heristat_args="$heristat_args $OPTARG";;
f)
@@ -247,7 +277,7 @@ tmp_dir=`mktemp -d $TMPDIR/svm.XXXXXX`
training_testing (){
if test -n "$number_of_folds"; then
- heri-split -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files
+ ${SVM_HERI_SPLIT_CMD} $herisplit_args -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files
if test -n "$seed"; then
seed="${seed}9876"
fi
@@ -287,12 +317,13 @@ show_stat (){
awk '{print $1}' "$tmp_dir/test${t}_$i.txt" > "$tmp_dir/golden_tags${t}_${i}"
if [[ "_$output_mode" =~ f ]]; then
echo "Fold ${t}x$i statistics"
- heri-stat $heristat_args \
+ ${SVM_HERI_STAT_CMD} $heristat_args \
"$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" |
indent2
echo ''
fi
- heri-stat -R "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \
+ ${SVM_HERI_STAT_CMD} -R \
+ "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \
> "$tmp_dir/evaluation${t}_${i}.txt"
paste "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | \
tr ' ' ' ' > "$tmp_dir/result_single${t}_${i}.txt"
@@ -303,6 +334,8 @@ show_stat (){
done
}
+export HERISVM_FC=`calculate_feature_count $files`
+
for t in `seq $times`; do
training_testing "$@"
# ls -l "$tmp_dir/"
@@ -341,11 +374,11 @@ fi
#
if [[ "_$output_mode" =~ t ]]; then
echo 'Total statistics'
- heri-stat -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2
+ ${SVM_HERI_STAT_CMD} -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2
echo ''
fi
if test -n "$number_of_folds" && [[ "_$output_mode" =~ c ]]; then
echo 'Total cross-folds statistics'
- heri-stat-addons "$tmp_dir"/evaluation*.txt | indent2
+ ${SVM_HERI_STAT_ADDONS_CMD} "$tmp_dir"/evaluation*.txt | indent2
fi
diff --git a/scripts/heri-eval.pod b/scripts/heri-eval.pod
index 46575f3..222e61c 100644
--- a/scripts/heri-eval.pod
+++ b/scripts/heri-eval.pod
@@ -28,15 +28,16 @@ Enable output of per-fold statistics. See B<-M>I<f>.
=item B<-n> I<N>
-I<N>-fold cross validation mode (mandatory option).
+Enable T*I<N>-fold cross-validation mode and set the number of folds to I<N>.
=item B<-t> I<T>
-I<T>*N-fold cross validation mode (1 by default).
+Enable I<T>*N-fold cross-validation mode and set the number of runs to I<T>
+which is 1 by default.
-=item B<-e> I<testing set>
+=item B<-e> I<testing_dataset>
-Sets the testing dataset.
+Enable hold-out mode and set the testing dataset.
=item B<-o> I<filename>
@@ -60,7 +61,11 @@ Format: frequency : golden_class result_class
=item B<-p> I<opts>
-Pass the specified I<opts> to B<heri-stat(1)>
+Pass the specified I<opts> to B<heri-stat(1)>.
+
+=item B<-s> I<opts>
+
+Pass the specified I<opts> to B<heri-split(1)>.
=item B<-M> I<chars>
@@ -98,6 +103,18 @@ Training utility, e.g., liblinear-train
Predicting utility, e.g., liblinear-predict
(the default is svm-predict).
+=item I<SVM_HERI_STAT_CMD>
+
+Utility for calculating statistics (the default is B<heri-stat(1)>).
+
+=item I<SVM_HERI_STAT_ADDONS_CMD>
+
+Utility for calculating additional statistics (the default is B<heri-stat-addons(1)>).
+
+=item I<SVM_HERI_SPLIT_CMD>
+
+Utility for splitting the dataset (the default is B<heri-split(1)>).
+
=item I<TMPDIR>
Temporary directory (the default is /tmp).
diff --git a/scripts/heri-split b/scripts/heri-split
index 0c4381b..91b1c77 100755
--- a/scripts/heri-split
+++ b/scripts/heri-split
@@ -1,7 +1,7 @@
#!/usr/bin/env ruby
# Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com>
-# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net>
+# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@@ -24,10 +24,11 @@
require 'optparse'
-options = {}
-fold_cnt = nil
-tmp_dir = nil
-seed = Random.new_seed
+$options = {}
+$fold_cnt = nil
+$tmp_dir = nil
+$seed = Random.new_seed
+$stratified = true
OptionParser.new do |opts|
opts.banner = <<EOF
@@ -44,94 +45,134 @@ EOF
end
opts.on("-cFOLD_CNT", "--folds=FOLD_CNT", "A number if folds (mandatory option)") do |c|
- fold_cnt = c.to_i
+ $fold_cnt = c.to_i
end
opts.on("-dDIR", "--output-dir=DIR", "Output directory (mandatory option)") do |d|
- tmp_dir = d
+ $tmp_dir = d
end
opts.on("-sSEED", "--seed=SEED", "Seed for pseudo-random number generator") do |s|
if s != "" then
- seed = s.to_i
+ $seed = s.to_i
end
end
+ opts.on("-r", "--random", "Use random split instead of stratified") do
+ $stratified = false
+ end
+
opts.separator " "
end.parse!
-if tmp_dir == nil or fold_cnt == nil then
+if $tmp_dir == nil or $fold_cnt == nil then
STDERR.puts("Options -c and -d are mandatory, see heri-split -h for details")
exit(1)
- end
-
-rnd = Random.new(seed)
+end
-#############################################
-#### to create ranges
+$rnd = Random.new($seed)
-files_test = []
-files_train = []
-testing_fold = File.open(tmp_dir+"/testing_fold.txt", 'w')
-(1..fold_cnt).each do |i|
+# same as in StratifiedSplitter
+$files_test = []
+$files_train = []
+$testing_fold = File.open($tmp_dir+"/testing_fold.txt", 'w:ASCII-8BIT')
+(1..$fold_cnt).each do |i|
name_train = "train" + "#{i.to_i}"
name_test = "test" + "#{i.to_i}"
- files_test << File.open(tmp_dir+"/"+name_test+".txt", 'w')
- files_train << File.open(tmp_dir+ "/"+ name_train+".txt", 'w')
+ $files_test << File.open($tmp_dir+"/"+name_test+".txt", 'w:ASCII-8BIT')
+ $files_train << File.open($tmp_dir+ "/"+ name_train+".txt", 'w:ASCII-8BIT')
end
-#############################################
-####
+def random_split()
+ nums = []
+ curr_number = 0
+ ARGV.each do |fn|
+ File.open(fn, "r:ASCII-8BIT").each_line do |line|
+ if line =~ /^([^\s]+)\s/
+ nums << curr_number % $fold_cnt
+ curr_number += 1
+ end
+ end
+ end
+
+ nums.shuffle!(random: $rnd)
+
+ curr_number = 0
+ ARGV.each do |fn|
+ File.open(fn, "r:ASCII-8BIT").each_line do |line|
+ if line =~ /^([^\s]+)\s/
+ fold_num = nums[curr_number]
+ $fold_cnt.times do |n|
+ if fold_num == n
+ $files_test[n].puts line
+ $testing_fold.puts n+1
+ else
+ $files_train[n].puts line
+ end
+ end
-classes = Hash.new(0)
-ARGV.each do |fn|
- File.open(fn).each_line do |line|
- if line =~ /^([^\s]+)\s/
- classes[$1] += 1
+ curr_number += 1
+ end
end
end
end
-classes_arr = {}
-classes.each do |x, y|
- arr = []
- i = 1
- while i <= y
- arr << i
- i +=1
+
+def stratified_split()
+ classes = Hash.new(0)
+ ARGV.each do |fn|
+ File.open(fn, "r:ASCII-8BIT").each_line do |line|
+ if line =~ /^([^\s]+)\s/
+ classes[$1] += 1
+ end
+ end
end
- cnt = (( y / fold_cnt.to_f ) ).to_i
- arr = arr.shuffle(random: rnd)
- classes_arr [x] = {}
- arr.each_index do |i|
- fold_train = (i * fold_cnt.to_f / arr.size).to_i
- classes_arr[x][arr[i]] = fold_train
+ classes_arr = {}
+ classes.each do |x, y|
+ arr = []
+ i = 1
+ while i <= y
+ arr << i
+ i +=1
+ end
+ cnt = (( y / $fold_cnt.to_f ) ).to_i
+ arr.shuffle!(random: $rnd)
+ classes_arr [x] = {}
+ arr.each_index do |i|
+ fold_train = (i * $fold_cnt.to_f / arr.size).to_i
+ classes_arr[x][arr[i]] = fold_train
+ end
end
-end
-num_line = Hash.new(0)
-ARGV.each do |fn|
- File.open(fn).each_line do |line|
- if line =~ /^([^\s]+)\s/
- num_line[$1] += 1
- curr_number = num_line[$1]
- fold_cnt.times do |n|
- if classes_arr[$1][curr_number] == n
- files_test[classes_arr[$1][curr_number]].puts line
- testing_fold.puts n+1
- else
- files_train[n].puts line
+ num_line = Hash.new(0)
+ ARGV.each do |fn|
+ File.open(fn, "r:ASCII-8BIT").each_line do |line|
+ if line =~ /^([^\s]+)\s/
+ num_line[$1] += 1
+ curr_number = num_line[$1]
+ $fold_cnt.times do |n|
+ if classes_arr[$1][curr_number] == n
+ $files_test[n].puts line
+ $testing_fold.puts n+1
+ else
+ $files_train[n].puts line
+ end
end
end
end
end
end
+if $stratified
+ stratified_split()
+else
+ random_split()
+end
-files_test.each { |x|
+$files_test.each { |x|
x.close
}
-files_train.each { |x|
+$files_train.each { |x|
x.close
}
-testing_fold.close
+$testing_fold.close
diff --git a/scripts/heri-split.pod b/scripts/heri-split.pod
index f213f6b..b7e937e 100644
--- a/scripts/heri-split.pod
+++ b/scripts/heri-split.pod
@@ -10,7 +10,9 @@ B<heri-split> [OPTIONS] I<dataset1> [I<dataset2>...]
B<heri-split> splits the dataset into several training and testing
sets as it is required for N-fold cross-validation. Dataset contains
-one object per line as in svmlight/libsvm formats.
+one object per line as in svmlight format. By default
+stratified sampling is used. That is, all folds contain
+approximately the same number of objects for each label.
=head1 OPTIONS
@@ -22,15 +24,19 @@ Display help information.
=item B<-c, --folds> I<count>
-Sets a number if folds. This is a mandatory option.
+Set the number of folds. This is a mandatory option.
=item B<-d, --output-dir> I<dir>
-Sets the output directory. This is a mandatory option.
+Set the output directory. This is a mandatory option.
+
+=item B<-r, --random>
+
+Use random sampling instead of stratified sampling.
=item B<-s, --seed> I<seed>
-Sets the seed value for pseudorandom generator.
+Set the seed value for the pseudorandom generator.
=back
diff --git a/scripts/heri-stat b/scripts/heri-stat
index ebaf700..ff7f819 100755
--- a/scripts/heri-stat
+++ b/scripts/heri-stat
@@ -68,7 +68,7 @@ def pretty_div(a, b)
end
def normalize_tag(tag)
- tag = tag.sub(/^[+]/, "") # +1 => 1
+ tag = tag.to_s.sub(/^[+]/, "") # +1 => 1
if tag =~ /^-?[0-9]+[.][0-9]+$/
tag = tag.sub(/[.]0+$/, "") # -1.0000 => -1
end
@@ -87,6 +87,7 @@ def split_into_3(line, fn)
when 3
ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), tokens[2].to_f]
else
+ ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX]
line.sub!(/^fake ?/, "")
STDERR.puts("Bad line '#{line}' in file '#{fn}'")
@err = 1
@@ -215,7 +216,7 @@ all_tp = 0
all_f1 = 0
res_tag2TP_cnt = tag2TP_cnt.sort_by { |key, value| key }
res_tag2TP_cnt.each do |t, tp|
- p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 1.0)
+ p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 0.0)
r = (tag2golden_cnt[t] > 0.0 ? tp.to_f / tag2golden_cnt[t] : 0.0)
f1 = (p+r > 0.0 ? 2*p*r / (p+r) : 0.0)
if !@options[:statistics]
diff --git a/scripts/heri-stat-addons b/scripts/heri-stat-addons
index 23e1617..6220931 100755
--- a/scripts/heri-stat-addons
+++ b/scripts/heri-stat-addons
@@ -120,7 +120,7 @@ FIELDS.each do |f, f_to_print|
pairs.each do |ft|
max_dev = max_deviation [ft[0]][ft[1]]
std_dev = std_deviation [ft[0]][ft[1]]
- max_dev = "%-5.3g%" % [max_dev*100]
+ max_dev = "%-5.3g" % [max_dev*100]
std_dev = "%-5.3g" % [std_dev*100]
print_value(ft[1], "max/std deviation(" + f_to_print + ")", max_dev, std_dev)
end
diff --git a/scripts/heri-stat.pod b/scripts/heri-stat.pod
index 061784a..6f0e77b 100644
--- a/scripts/heri-stat.pod
+++ b/scripts/heri-stat.pod
@@ -1,7 +1,6 @@
=head1 NAME
-heri-stat - calculates precision, recall, F1
-and some other things for given golden data and predictions.
+heri-stat - calculates precision, recall, F1 and some other things
=head1 SYNOPSIS
@@ -51,7 +50,7 @@ Disable output of accuracy.
=item B<-1, --single>
-2 or 3 tokens per line are expected on input
+2 or 3 tokens per line are expected on input.
=item B<-u, --unclassified> I<label>
diff --git a/tests/dataset1.txt b/tests/dataset1.txt
index 76b8e4a..7dd27d0 100644
--- a/tests/dataset1.txt
+++ b/tests/dataset1.txt
@@ -1,9 +1,9 @@
-1 features1
-2 features2
-3 features3
-4 features4
-5 features5
-6 features6
-7 features7
-8 features8
-9 features9
+1 свойство1
+2 свойство2
+3 свойство3
+4 свойство4
+5 свойство5
+6 свойство6
+7 свойство7
+8 свойство8
+9 свойство9
diff --git a/tests/test_heri-eval.sh b/tests/test_heri-eval.sh
index c59d564..d1a1df1 100644
--- a/tests/test_heri-eval.sh
+++ b/tests/test_heri-eval.sh
@@ -292,9 +292,9 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
cmp 'heri-eval #10.1 -- options' \
'Total statistics
Class 0 P, R, F1: 0.5797 40/69 , 1 40/40 , 0.7339
- Class 1 P, R, F1: 1 0/0 , 0 0/29 , 0
+ Class 1 P, R, F1: 0 0/0 , 0 0/29 , 0
Accuracy : 0.5797 40/69
- Macro average P, R, F1: 0.7899 , 0.5 , 0.367
+ Macro average P, R, F1: 0.2899 , 0.5 , 0.367
'
@@ -302,9 +302,20 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
heri-eval -Mt -n2 matrix.libsvm -- -1 2>&1 |
cmp 'heri-eval #10.2 -- options' \
'Total statistics
- Class 0 P, R, F1: 1 0/0 , 0 0/40 , 0
+ Class 0 P, R, F1: 0 0/0 , 0 0/40 , 0
Class 1 P, R, F1: 0.4203 29/69 , 1 29/29 , 0.5918
Accuracy : 0.4203 29/69
- Macro average P, R, F1: 0.7101 , 0.5 , 0.2959
+ Macro average P, R, F1: 0.2101 , 0.5 , 0.2959
+
+'
+
+env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \
+ heri-eval -Mt -s '-r' -S117 -n2 matrix.libsvm -- -1 2>&1 |
+cmp 'heri-eval #10.3 -- options' \
+'Total statistics
+ Class 0 P, R, F1: 0 0/0 , 0 0/40 , 0
+ Class 1 P, R, F1: 0.4203 29/69 , 1 29/29 , 0.5918
+ Accuracy : 0.4203 29/69
+ Macro average P, R, F1: 0.2101 , 0.5 , 0.2959
'
diff --git a/tests/test_heri-split.sh b/tests/test_heri-split.sh
index 9d96560..0caf09b 100644
--- a/tests/test_heri-split.sh
+++ b/tests/test_heri-split.sh
@@ -1,4 +1,4 @@
-# heri-split -- no tests yet
+# -*- coding: utf-8 -*-
dataset="$tmpdir/dataset"
res_dir="$tmpdir/dir1"
@@ -34,6 +34,35 @@ exit status=1
'
generate_random_dataset
+
+rm -rf "$res_dir"/*
+
+{ heri-split -rc2 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } |
+cmp 'heri-split -r #2 exit code' \
+'exit status=0
+'
+
+ls -1 "$res_dir" | sort |
+cmp 'heri-split -r #3 result files' \
+'test1.txt
+test2.txt
+testing_fold.txt
+train1.txt
+train2.txt
+'
+
+for i in 1 2; do
+ { cat "$res_dir/test${i}.txt" "$res_dir/train${i}.txt" | sort -k3,3n; } |
+ cmp2 "heri-split -r #4.${i} all objects" \
+ "$dataset"
+done
+
+{ cat "$res_dir/"test?.txt | sort -k3,3n; } |
+cmp2 "heri-split -r #5 testing sets correctness" \
+ "$dataset"
+
+rm -rf "$res_dir"/*
+
{ heri-split -c 3 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } |
cmp 'heri-split #2 exit code' \
'exit status=0
@@ -61,9 +90,18 @@ cmp2 "heri-split #5 testing sets correctness" \
"$dataset"
rm "$res_dir"/*
+heri-split -r -d "$res_dir" -c 4 dataset1.txt
+val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt`
+val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
+ sort -k1,1n -k2,2n |
+ awk '{print $2, $3}'`
+printf '%s' "$val1" | cmp "heri-split -r #6 correct testing_fold.txt" \
+ "$val2"
+
+rm "$res_dir"/*
heri-split -d "$res_dir" -c 4 dataset1.txt
val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt`
-val2=`awk '{printf "%d %d features%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
+val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt |
sort -k1,1n -k2,2n |
awk '{print $2, $3}'`
printf '%s' "$val1" | cmp "heri-split #6 correct testing_fold.txt" \
@@ -74,7 +112,7 @@ heri-split -d "$res_dir" -c 9 dataset2.txt
for i in 1 2 3 4 5 6 7 8 9; do
wc -l "$res_dir/test$i.txt" | awk '{print $1}'
done |
-cmp "heri-split #7 correct stratified splitting" \
+cmp "heri-split #7 correct stratified sampling" \
'1
1
1
@@ -87,6 +125,18 @@ cmp "heri-split #7 correct stratified splitting" \
'
rm "$res_dir"/*
+heri-split -r -d "$res_dir" -c 4 dataset2.txt
+for i in 1 2 3 4; do
+ wc -l "$res_dir/test$i.txt" | awk '{print $1}'
+done |
+cmp "heri-split #7 correct random sampling" \
+ '3
+2
+2
+2
+'
+
+rm "$res_dir"/*
heri-split -d "$res_dir" -c 2 dataset3.txt
for j in 1 2; do
echo "dataset: $j"
@@ -95,7 +145,7 @@ for j in 1 2; do
END {printf("tag %s -> %s\n", tag, cnt)}' "$res_dir/test$j.txt"
done
done |
-cmp "heri-split #8 correct stratified splitting" \
+cmp "heri-split #8 correct stratified sampling" \
'dataset: 1
tag 1 -> 1
tag 2 -> 1
@@ -118,7 +168,7 @@ for i in 0 1 2 3 4 5 6 7 8 9; do
END {print cnt}' "$res_dir/train$j.txt"
done | sort | awk '{ma = $1} NR == 1 {mi = $1} END {print ((ma - mi) <= 1)}'
done |
-cmp "heri-split #9 correct stratified splitting" \
+cmp "heri-split #9 correct stratified sampling" \
'tag: 0
1
tag: 1
diff --git a/tests/test_heri-stat.sh b/tests/test_heri-stat.sh
index 78f3c6c..d32b438 100644
--- a/tests/test_heri-stat.sh
+++ b/tests/test_heri-stat.sh
@@ -122,10 +122,10 @@ Macro average f1 0.NNNN
heri-stat golden3.txt result3.txt 2>&1 |
remove_fractions |
cmp 'heri-stat #8 all equal' \
-'Class A P, R, F1: 1 0/0 , 0 0/6 , 0
+'Class A P, R, F1: 0 0/0 , 0 0/6 , 0
Class B P, R, F1: 0 0/6 , 0 0/0 , 0
Accuracy : 0 0/6
-Macro average P, R, F1: 0.NNNN , 0 , 0
+Macro average P, R, F1: 0 , 0 , 0
'
heri-stat golden3.txt /dev/null 2>&1 |
@@ -201,10 +201,10 @@ Macro average P, R, F1: 1 , 0.75 , 0.8439
heri-stat -t 0.93 golden5.txt result5_prob.txt 2>&1 |
cmp 'heri-stat #13.5 -t 0.93' \
-'Class A P, R, F1: 1 0/0 , 0 0/2 , 0
-Class B P, R, F1: 1 0/0 , 0 0/3 , 0
+'Class A P, R, F1: 0 0/0 , 0 0/2 , 0
+Class B P, R, F1: 0 0/0 , 0 0/3 , 0
Class C P, R, F1: 1 3/3 , 0.75 3/4 , 0.8571
Class E P, R, F1: 1 3/3 , 0.5 3/6 , 0.6667
Micro average P, R, F1: 1 6/6 , 0.4 6/15 , 0.5714
-Macro average P, R, F1: 1 , 0.3125 , 0.381
+Macro average P, R, F1: 0.5 , 0.3125 , 0.381
'