diff options
author | Andrew Shadura <andrewsh@debian.org> | 2018-03-27 09:00:23 +0200 |
---|---|---|
committer | Andrew Shadura <andrewsh@debian.org> | 2018-03-27 09:00:23 +0200 |
commit | 3a7a7cb2ec4965de7a7125ff3fbcb19a75847fcd (patch) | |
tree | 5ba8acf931b8d803b31e7d6b4ff9ddbc3dbbbabb | |
parent | dcbd9ccf5345ff85848df806a50ac4b2c08aaa5f (diff) | |
parent | 9d064c7bcd947481e225c9693463e28fa281e1c6 (diff) |
Merge tag 'herisvm-0.8.1' into dgit/sid
-rw-r--r-- | doc/NEWS | 8 | ||||
-rw-r--r-- | doc/TODO | 2 | ||||
-rwxr-xr-x | scripts/heri-eval | 81 | ||||
-rw-r--r-- | scripts/heri-eval.pod | 27 | ||||
-rwxr-xr-x | scripts/heri-split | 153 | ||||
-rw-r--r-- | scripts/heri-split.pod | 14 | ||||
-rwxr-xr-x | scripts/heri-stat | 5 | ||||
-rwxr-xr-x | scripts/heri-stat-addons | 2 | ||||
-rw-r--r-- | scripts/heri-stat.pod | 5 | ||||
-rw-r--r-- | tests/dataset1.txt | 18 | ||||
-rw-r--r-- | tests/test_heri-eval.sh | 19 | ||||
-rw-r--r-- | tests/test_heri-split.sh | 60 | ||||
-rw-r--r-- | tests/test_heri-stat.sh | 10 |
13 files changed, 285 insertions, 119 deletions
@@ -1,3 +1,9 @@ ====================================================================== +Version 0.8.1, Thu, 15 Mar 2018 15:28:46 +0300 + + heri-stat-addons: format string for max_dev was fixed + +====================================================================== Version 0.1.0, Sat, 13 Jun 2015 12:53:02 +0300 - initial publicly available release + + initial publicly available release @@ -1,4 +1,6 @@ * heri-eval: + - heri-eval -T: target class - Repeated random sub-sampling heri-eval -t 10 -r 60 ... - Alternative formats (crfsuite) for heri-split + - Support for IE (no classes, just information extraction) diff --git a/scripts/heri-eval b/scripts/heri-eval index be1ed8a..1a26193 100755 --- a/scripts/heri-eval +++ b/scripts/heri-eval @@ -1,7 +1,7 @@ #!/usr/bin/env bash # Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com> -# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net> +# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net> # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -25,6 +25,11 @@ # variables settable by user : ${SVM_TRAIN_CMD:=svm-train} : ${SVM_PREDICT_CMD:=svm-predict} + +: ${SVM_HERI_STAT_CMD:=heri-stat} +: ${SVM_HERI_STAT_ADDONS_CMD:=heri-stat-addons} +: ${SVM_HERI_SPLIT_CMD:=heri-split} + : ${TMPDIR:=/tmp} ############################################################ @@ -52,6 +57,30 @@ on_exit(){ fi } +calculate_feature_count (){ + awk '{ + for (i=2; i <= NF; ++i) { + if ($i + 0 > m) + m = $i + 0 + } + } + END { + print m+1 + }' "$@" +} + +calculate_feature_count (){ + awk '{ + for (i=2; i <= NF; ++i) { + if ($i + 0 > m) + m = $i + 0 + } + } + END { + print m+1 + }' "$@" +} + results_from_testing_sets (){ if ! 
test -s "$tmp_dir/testing_fold.txt"; then cat "$tmp_dir/result_single1.txt" @@ -128,38 +157,37 @@ usage(){ usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS] Examples: heri-eval -n5 dataset.libsvm # 5-fold cross-validation - heri-eval -e testing.libsvm dataset.libsvm # testing on testing.libsvm + heri-eval -e testing.libsvm training.libsvm # testing on testing.libsvm OPTIONS: - -h help message + -h Help message - -n N N-fold cross validation mode (mandatory option) - -t T T*N-fold cross validation mode (1 by default) + -n N The number of folds for T*N-fold cross-validation + -t T The number of runs for T*N-fold cross-validation - -e testing_set testing set for hold-out method + -e testing_set Testing set for hold-out - -o <filename> save results from testing sets + -o <filename> Save results from testing sets to the specified file (golden_tag result_tag [score]) - -O <filename> save incorrectly classified objects + -O <filename> Save incorrectly classified objects to the specified file (#object_number: golden_tag result_tag [score]) - -m <filename> save confusion matrix to the specified file + -m <filename> Save confusion matrix to the specified file (frequency : golden_tag result_tag) -f Enable output of per-fold statistics (see -Mf) - -M <chars> output mode: + -M <chars> Output mode: t -- output total statistics, f -- output per-fold statistics, c -- output cross-fold statistics. - The default is "-M tc". - -p <stat_opts> options passed to heri-stat(1) - -S <seed> seed pseudo-random generator used for splitting - dataset into traing and testing parts. - The default is empty, which means - 'split dataset randomly every invocation' - -K keep temporary directory after exiting - -D debugging mode, implies -K + -s <split_opts> Options passed to heri-split(1) + -p <stat_opts> Options passed to heri-stat(1) + -S <seed> Seed value passed to heri-split(1). + If it is not specified, the dataset is splitted + into training and testing datasets randomly. 
+ -K Keep temporary directory after exiting + -D Debugging mode, implies -K SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike @@ -181,7 +209,7 @@ EOF runs=1 output_mode=tc times=1 -while getopts De:fhKm:M:n:o:O:p:S:t: f; do +while getopts De:fhKm:M:n:o:O:p:s:S:t: f; do case "$f" in '?') usage @@ -201,6 +229,8 @@ while getopts De:fhKm:M:n:o:O:p:S:t: f; do results="$OPTARG";; O) incorrect_results="$OPTARG";; + s) + herisplit_args="$herisplit_args $OPTARG";; p) heristat_args="$heristat_args $OPTARG";; f) @@ -247,7 +277,7 @@ tmp_dir=`mktemp -d $TMPDIR/svm.XXXXXX` training_testing (){ if test -n "$number_of_folds"; then - heri-split -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files + ${SVM_HERI_SPLIT_CMD} $herisplit_args -c "$number_of_folds" -d "$tmp_dir" -s "$seed" $files if test -n "$seed"; then seed="${seed}9876" fi @@ -287,12 +317,13 @@ show_stat (){ awk '{print $1}' "$tmp_dir/test${t}_$i.txt" > "$tmp_dir/golden_tags${t}_${i}" if [[ "_$output_mode" =~ f ]]; then echo "Fold ${t}x$i statistics" - heri-stat $heristat_args \ + ${SVM_HERI_STAT_CMD} $heristat_args \ "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | indent2 echo '' fi - heri-stat -R "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \ + ${SVM_HERI_STAT_CMD} -R \ + "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" \ > "$tmp_dir/evaluation${t}_${i}.txt" paste "$tmp_dir/golden_tags${t}_${i}" "$tmp_dir/result${t}_${i}.txt" | \ tr ' ' ' ' > "$tmp_dir/result_single${t}_${i}.txt" @@ -303,6 +334,8 @@ show_stat (){ done } +export HERISVM_FC=`calculate_feature_count $files` + for t in `seq $times`; do training_testing "$@" # ls -l "$tmp_dir/" @@ -341,11 +374,11 @@ fi # if [[ "_$output_mode" =~ t ]]; then echo 'Total statistics' - heri-stat -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2 + ${SVM_HERI_STAT_CMD} -1 $heristat_args "$tmp_dir"/result_single*_*.txt | indent2 echo '' fi if test -n "$number_of_folds" && [[ "_$output_mode" =~ c ]]; then echo 
'Total cross-folds statistics' - heri-stat-addons "$tmp_dir"/evaluation*.txt | indent2 + ${SVM_HERI_STAT_ADDONS_CMD} "$tmp_dir"/evaluation*.txt | indent2 fi diff --git a/scripts/heri-eval.pod b/scripts/heri-eval.pod index 46575f3..222e61c 100644 --- a/scripts/heri-eval.pod +++ b/scripts/heri-eval.pod @@ -28,15 +28,16 @@ Enable output of per-fold statistics. See B<-M>I<f>. =item B<-n> I<N> -I<N>-fold cross validation mode (mandatory option). +Enable T*I<N>-fold cross-validation mode and set the number of folds to I<N>. =item B<-t> I<T> -I<T>*N-fold cross validation mode (1 by default). +Enable I<T>*N-fold cross-validation mode and set the number of runs to I<T> +which is 1 by default. -=item B<-e> I<testing set> +=item B<-e> I<testing_dataset> -Sets the testing dataset. +Enable hold-out mode and set the testing dataset. =item B<-o> I<filename> @@ -60,7 +61,11 @@ Format: frequency : golden_class result_class =item B<-p> I<opts> -Pass the specified I<opts> to B<heri-stat(1)> +Pass the specified I<opts> to B<heri-stat(1)>. + +=item B<-s> I<opts> + +Pass the specified I<opts> to B<heri-split(1)>. =item B<-M> I<chars> @@ -98,6 +103,18 @@ Training utility, e.g., liblinear-train Predicting utility, e.g., liblinear-predict (the default is svm-predict). +=item I<SVM_HERI_STAT_CMD> + +Utility for calculating statistics (the default is B<heri-stat(1)>). + +=item I<SVM_HERI_STAT_ADDONS_CMD> + +Utility for calculating additional statistics (the default is B<heri-stat-addons(1)>). + +=item I<SVM_HERI_SPLIT_CMD> + +Utility for splitting the dataset (the default is B<heri-split(1)>). + =item I<TMPDIR> Temporary directory (the default is /tmp). 
diff --git a/scripts/heri-split b/scripts/heri-split index 0c4381b..91b1c77 100755 --- a/scripts/heri-split +++ b/scripts/heri-split @@ -1,7 +1,7 @@ #!/usr/bin/env ruby # Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com> -# Copyright (c) 2015 Aleksey Cheusov <vle@gmx.net> +# Copyright (c) 2015-2017 Aleksey Cheusov <vle@gmx.net> # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -24,10 +24,11 @@ require 'optparse' -options = {} -fold_cnt = nil -tmp_dir = nil -seed = Random.new_seed +$options = {} +$fold_cnt = nil +$tmp_dir = nil +$seed = Random.new_seed +$stratified = true OptionParser.new do |opts| opts.banner = <<EOF @@ -44,94 +45,134 @@ EOF end opts.on("-cFOLD_CNT", "--folds=FOLD_CNT", "A number if folds (mandatory option)") do |c| - fold_cnt = c.to_i + $fold_cnt = c.to_i end opts.on("-dDIR", "--output-dir=DIR", "Output directory (mandatory option)") do |d| - tmp_dir = d + $tmp_dir = d end opts.on("-sSEED", "--seed=SEED", "Seed for pseudo-random number generator") do |s| if s != "" then - seed = s.to_i + $seed = s.to_i end end + opts.on("-r", "--random", "Use random split instead of stratified") do + $stratified = false + end + opts.separator " " end.parse! 
-if tmp_dir == nil or fold_cnt == nil then +if $tmp_dir == nil or $fold_cnt == nil then STDERR.puts("Options -c and -d are mandatory, see heri-split -h for details") exit(1) - end - -rnd = Random.new(seed) +end -############################################# -#### to create ranges +$rnd = Random.new($seed) -files_test = [] -files_train = [] -testing_fold = File.open(tmp_dir+"/testing_fold.txt", 'w') -(1..fold_cnt).each do |i| +# same as in StratifiedSplitter +$files_test = [] +$files_train = [] +$testing_fold = File.open($tmp_dir+"/testing_fold.txt", 'w:ASCII-8BIT') +(1..$fold_cnt).each do |i| name_train = "train" + "#{i.to_i}" name_test = "test" + "#{i.to_i}" - files_test << File.open(tmp_dir+"/"+name_test+".txt", 'w') - files_train << File.open(tmp_dir+ "/"+ name_train+".txt", 'w') + $files_test << File.open($tmp_dir+"/"+name_test+".txt", 'w:ASCII-8BIT') + $files_train << File.open($tmp_dir+ "/"+ name_train+".txt", 'w:ASCII-8BIT') end -############################################# -#### +def random_split() + nums = [] + curr_number = 0 + ARGV.each do |fn| + File.open(fn, "r:ASCII-8BIT").each_line do |line| + if line =~ /^([^\s]+)\s/ + nums << curr_number % $fold_cnt + curr_number += 1 + end + end + end + + nums.shuffle!(random: $rnd) + + curr_number = 0 + ARGV.each do |fn| + File.open(fn, "r:ASCII-8BIT").each_line do |line| + if line =~ /^([^\s]+)\s/ + fold_num = nums[curr_number] + $fold_cnt.times do |n| + if fold_num == n + $files_test[n].puts line + $testing_fold.puts n+1 + else + $files_train[n].puts line + end + end -classes = Hash.new(0) -ARGV.each do |fn| - File.open(fn).each_line do |line| - if line =~ /^([^\s]+)\s/ - classes[$1] += 1 + curr_number += 1 + end end end end -classes_arr = {} -classes.each do |x, y| - arr = [] - i = 1 - while i <= y - arr << i - i +=1 + +def stratified_split() + classes = Hash.new(0) + ARGV.each do |fn| + File.open(fn, "r:ASCII-8BIT").each_line do |line| + if line =~ /^([^\s]+)\s/ + classes[$1] += 1 + end + end end - cnt = (( 
y / fold_cnt.to_f ) ).to_i - arr = arr.shuffle(random: rnd) - classes_arr [x] = {} - arr.each_index do |i| - fold_train = (i * fold_cnt.to_f / arr.size).to_i - classes_arr[x][arr[i]] = fold_train + classes_arr = {} + classes.each do |x, y| + arr = [] + i = 1 + while i <= y + arr << i + i +=1 + end + cnt = (( y / $fold_cnt.to_f ) ).to_i + arr.shuffle!(random: $rnd) + classes_arr [x] = {} + arr.each_index do |i| + fold_train = (i * $fold_cnt.to_f / arr.size).to_i + classes_arr[x][arr[i]] = fold_train + end end -end -num_line = Hash.new(0) -ARGV.each do |fn| - File.open(fn).each_line do |line| - if line =~ /^([^\s]+)\s/ - num_line[$1] += 1 - curr_number = num_line[$1] - fold_cnt.times do |n| - if classes_arr[$1][curr_number] == n - files_test[classes_arr[$1][curr_number]].puts line - testing_fold.puts n+1 - else - files_train[n].puts line + num_line = Hash.new(0) + ARGV.each do |fn| + File.open(fn, "r:ASCII-8BIT").each_line do |line| + if line =~ /^([^\s]+)\s/ + num_line[$1] += 1 + curr_number = num_line[$1] + $fold_cnt.times do |n| + if classes_arr[$1][curr_number] == n + $files_test[n].puts line + $testing_fold.puts n+1 + else + $files_train[n].puts line + end end end end end end +if $stratified + stratified_split() +else + random_split() +end -files_test.each { |x| +$files_test.each { |x| x.close } -files_train.each { |x| +$files_train.each { |x| x.close } -testing_fold.close +$testing_fold.close diff --git a/scripts/heri-split.pod b/scripts/heri-split.pod index f213f6b..b7e937e 100644 --- a/scripts/heri-split.pod +++ b/scripts/heri-split.pod @@ -10,7 +10,9 @@ B<heri-split> [OPTIONS] I<dataset1> [I<dataset2>...] B<heri-split> splits the dataset into several training and testing sets as it is required for N-fold cross-validation. Dataset contains -one object per line as in svmlight/libsvm formats. +one object per line as in svmlight format. By default +stratified sampling is used. That is, all folds contain +the same number of objects for each label. 
=head1 OPTIONS @@ -22,15 +24,19 @@ Display help information. =item B<-c, --folds> I<count> -Sets a number if folds. This is a mandatory option. +Set the number of folds. This is a mandatory option. =item B<-d, --output-dir> I<dir> -Sets the output directory. This is a mandatory option. +Set the output directory. This is a mandatory option. + +=item B<-r,--random> + +Use random sampling instead of stratified one. =item B<-s, --seed> I<seed> -Sets the seed value for pseudorandom generator. +Set the seed value for pseudorandom generator. =back diff --git a/scripts/heri-stat b/scripts/heri-stat index ebaf700..ff7f819 100755 --- a/scripts/heri-stat +++ b/scripts/heri-stat @@ -68,7 +68,7 @@ def pretty_div(a, b) end def normalize_tag(tag) - tag = tag.sub(/^[+]/, "") # +1 => 1 + tag = tag.to_s.sub(/^[+]/, "") # +1 => 1 if tag =~ /^-?[0-9]+[.][0-9]+$/ tag = tag.sub(/[.]0+$/, "") # -1.0000 => -1 end @@ -87,6 +87,7 @@ def split_into_3(line, fn) when 3 ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), tokens[2].to_f] else + ret = [normalize_tag(tokens[0]), normalize_tag(tokens[1]), Float::MAX] line.sub!(/^fake ?/, "") STDERR.puts("Bad line '#{line}' in file '#{fn}'") @err = 1 @@ -215,7 +216,7 @@ all_tp = 0 all_f1 = 0 res_tag2TP_cnt = tag2TP_cnt.sort_by { |key, value| key } res_tag2TP_cnt.each do |t, tp| - p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 1.0) + p = (tag2result_cnt[t] > 0.0 ? tp.to_f / tag2result_cnt[t] : 0.0) r = (tag2golden_cnt[t] > 0.0 ? tp.to_f / tag2golden_cnt[t] : 0.0) f1 = (p+r > 0.0 ? 
2*p*r / (p+r) : 0.0) if !@options[:statistics] diff --git a/scripts/heri-stat-addons b/scripts/heri-stat-addons index 23e1617..6220931 100755 --- a/scripts/heri-stat-addons +++ b/scripts/heri-stat-addons @@ -120,7 +120,7 @@ FIELDS.each do |f, f_to_print| pairs.each do |ft| max_dev = max_deviation [ft[0]][ft[1]] std_dev = std_deviation [ft[0]][ft[1]] - max_dev = "%-5.3g%" % [max_dev*100] + max_dev = "%-5.3g" % [max_dev*100] std_dev = "%-5.3g" % [std_dev*100] print_value(ft[1], "max/std deviation(" + f_to_print + ")", max_dev, std_dev) end diff --git a/scripts/heri-stat.pod b/scripts/heri-stat.pod index 061784a..6f0e77b 100644 --- a/scripts/heri-stat.pod +++ b/scripts/heri-stat.pod @@ -1,7 +1,6 @@ =head1 NAME -heri-stat - calculates precision, recall, F1 -and some other things for given golden data and predictions. +heri-stat - calculates precision, recall, F1 and some other things =head1 SYNOPSIS @@ -51,7 +50,7 @@ Disable output of accuracy. =item B<-1, --single> -2 or 3 tokens per line are expected on input +2 or 3 tokens per line are expected on input. 
=item B<-u, --unclassified> I<label> diff --git a/tests/dataset1.txt b/tests/dataset1.txt index 76b8e4a..7dd27d0 100644 --- a/tests/dataset1.txt +++ b/tests/dataset1.txt @@ -1,9 +1,9 @@ -1 features1 -2 features2 -3 features3 -4 features4 -5 features5 -6 features6 -7 features7 -8 features8 -9 features9 +1 свойство1 +2 свойство2 +3 свойство3 +4 свойство4 +5 свойство5 +6 свойство6 +7 свойство7 +8 свойство8 +9 свойство9 diff --git a/tests/test_heri-eval.sh b/tests/test_heri-eval.sh index c59d564..d1a1df1 100644 --- a/tests/test_heri-eval.sh +++ b/tests/test_heri-eval.sh @@ -292,9 +292,9 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \ cmp 'heri-eval #10.1 -- options' \ 'Total statistics Class 0 P, R, F1: 0.5797 40/69 , 1 40/40 , 0.7339 - Class 1 P, R, F1: 1 0/0 , 0 0/29 , 0 + Class 1 P, R, F1: 0 0/0 , 0 0/29 , 0 Accuracy : 0.5797 40/69 - Macro average P, R, F1: 0.7899 , 0.5 , 0.367 + Macro average P, R, F1: 0.2899 , 0.5 , 0.367 ' @@ -302,9 +302,20 @@ env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \ heri-eval -Mt -n2 matrix.libsvm -- -1 2>&1 | cmp 'heri-eval #10.2 -- options' \ 'Total statistics - Class 0 P, R, F1: 1 0/0 , 0 0/40 , 0 + Class 0 P, R, F1: 0 0/0 , 0 0/40 , 0 Class 1 P, R, F1: 0.4203 29/69 , 1 29/29 , 0.5918 Accuracy : 0.4203 29/69 - Macro average P, R, F1: 0.7101 , 0.5 , 0.2959 + Macro average P, R, F1: 0.2101 , 0.5 , 0.2959 + +' + +env SVM_TRAIN_CMD=test_train SVM_PREDICT_CMD=test_predict \ + heri-eval -Mt -s '-r' -S117 -n2 matrix.libsvm -- -1 2>&1 | +cmp 'heri-eval #10.3 -- options' \ +'Total statistics + Class 0 P, R, F1: 0 0/0 , 0 0/40 , 0 + Class 1 P, R, F1: 0.4203 29/69 , 1 29/29 , 0.5918 + Accuracy : 0.4203 29/69 + Macro average P, R, F1: 0.2101 , 0.5 , 0.2959 ' diff --git a/tests/test_heri-split.sh b/tests/test_heri-split.sh index 9d96560..0caf09b 100644 --- a/tests/test_heri-split.sh +++ b/tests/test_heri-split.sh @@ -1,4 +1,4 @@ -# heri-split -- no tests yet +# -*- coding: utf-8 -*- dataset="$tmpdir/dataset" 
res_dir="$tmpdir/dir1" @@ -34,6 +34,35 @@ exit status=1 ' generate_random_dataset + +rm -rf "$res_dir"/* + +{ heri-split -rc2 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } | +cmp 'heri-split -r #2 exit code' \ +'exit status=0 +' + +ls -1 "$res_dir" | sort | +cmp 'heri-split -r #3 result files' \ +'test1.txt +test2.txt +testing_fold.txt +train1.txt +train2.txt +' + +for i in 1 2; do + { cat "$res_dir/test${i}.txt" "$res_dir/train${i}.txt" | sort -k3,3n; } | + cmp2 "heri-split -r #4.${i} all objects" \ + "$dataset" +done + +{ cat "$res_dir/"test?.txt | sort -k3,3n; } | +cmp2 "heri-split -r #5 testing sets correctness" \ + "$dataset" + +rm -rf "$res_dir"/* + { heri-split -c 3 -d "$res_dir" "$dataset" 2>&1; echo "exit status=$?"; } | cmp 'heri-split #2 exit code' \ 'exit status=0 @@ -61,9 +90,18 @@ cmp2 "heri-split #5 testing sets correctness" \ "$dataset" rm "$res_dir"/* +heri-split -r -d "$res_dir" -c 4 dataset1.txt +val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt` +val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt | + sort -k1,1n -k2,2n | + awk '{print $2, $3}'` +printf '%s' "$val1" | cmp "heri-split -r #6 correct testing_fold.txt" \ + "$val2" + +rm "$res_dir"/* heri-split -d "$res_dir" -c 4 dataset1.txt val1=`cat $res_dir/test1.txt $res_dir/test2.txt $res_dir/test3.txt $res_dir/test4.txt` -val2=`awk '{printf "%d %d features%d\n", $1, NR, NR}' $res_dir/testing_fold.txt | +val2=`awk '{printf "%d %d свойство%d\n", $1, NR, NR}' $res_dir/testing_fold.txt | sort -k1,1n -k2,2n | awk '{print $2, $3}'` printf '%s' "$val1" | cmp "heri-split #6 correct testing_fold.txt" \ @@ -74,7 +112,7 @@ heri-split -d "$res_dir" -c 9 dataset2.txt for i in 1 2 3 4 5 6 7 8 9; do wc -l "$res_dir/test$i.txt" | awk '{print $1}' done | -cmp "heri-split #7 correct stratified splitting" \ +cmp "heri-split #7 correct stratified sampling" \ '1 1 1 @@ -87,6 +125,18 @@ cmp "heri-split #7 correct stratified splitting" \ ' rm 
"$res_dir"/* +heri-split -r -d "$res_dir" -c 4 dataset2.txt +for i in 1 2 3 4; do + wc -l "$res_dir/test$i.txt" | awk '{print $1}' +done | +cmp "heri-split #7 correct random sampling" \ + '3 +2 +2 +2 +' + +rm "$res_dir"/* heri-split -d "$res_dir" -c 2 dataset3.txt for j in 1 2; do echo "dataset: $j" @@ -95,7 +145,7 @@ for j in 1 2; do END {printf("tag %s -> %s\n", tag, cnt)}' "$res_dir/test$j.txt" done done | -cmp "heri-split #8 correct stratified splitting" \ +cmp "heri-split #8 correct stratified sampling" \ 'dataset: 1 tag 1 -> 1 tag 2 -> 1 @@ -118,7 +168,7 @@ for i in 0 1 2 3 4 5 6 7 8 9; do END {print cnt}' "$res_dir/train$j.txt" done | sort | awk '{ma = $1} NR == 1 {mi = $1} END {print ((ma - mi) <= 1)}' done | -cmp "heri-split #9 correct stratified splitting" \ +cmp "heri-split #9 correct stratified sampling" \ 'tag: 0 1 tag: 1 diff --git a/tests/test_heri-stat.sh b/tests/test_heri-stat.sh index 78f3c6c..d32b438 100644 --- a/tests/test_heri-stat.sh +++ b/tests/test_heri-stat.sh @@ -122,10 +122,10 @@ Macro average f1 0.NNNN heri-stat golden3.txt result3.txt 2>&1 | remove_fractions | cmp 'heri-stat #8 all equal' \ -'Class A P, R, F1: 1 0/0 , 0 0/6 , 0 +'Class A P, R, F1: 0 0/0 , 0 0/6 , 0 Class B P, R, F1: 0 0/6 , 0 0/0 , 0 Accuracy : 0 0/6 -Macro average P, R, F1: 0.NNNN , 0 , 0 +Macro average P, R, F1: 0 , 0 , 0 ' heri-stat golden3.txt /dev/null 2>&1 | @@ -201,10 +201,10 @@ Macro average P, R, F1: 1 , 0.75 , 0.8439 heri-stat -t 0.93 golden5.txt result5_prob.txt 2>&1 | cmp 'heri-stat #13.5 -t 0.93' \ -'Class A P, R, F1: 1 0/0 , 0 0/2 , 0 -Class B P, R, F1: 1 0/0 , 0 0/3 , 0 +'Class A P, R, F1: 0 0/0 , 0 0/2 , 0 +Class B P, R, F1: 0 0/0 , 0 0/3 , 0 Class C P, R, F1: 1 3/3 , 0.75 3/4 , 0.8571 Class E P, R, F1: 1 3/3 , 0.5 3/6 , 0.6667 Micro average P, R, F1: 1 6/6 , 0.4 6/15 , 0.5714 -Macro average P, R, F1: 1 , 0.3125 , 0.381 +Macro average P, R, F1: 0.5 , 0.3125 , 0.381 ' |