Commit
Merge pull request VowpalWabbit#36 from hal3/master
new version of ldf functionality, many bugs fixed
JohnLangford committed Jul 9, 2012
2 parents 258c9d9 + 2cff834 commit bdb4571
Showing 36 changed files with 1,242 additions and 573 deletions.
5 changes: 4 additions & 1 deletion Makefile
@@ -30,7 +30,10 @@ FLAGS = $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -I $(BOOST_I
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -pg -g

# for valgrind
-#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
+FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
+
+# for valgrind profiling: run 'valgrind --tool=callgrind PROGRAM' then 'callgrind_annotate --tree=both --inclusive=yes'
+#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O3 -fomit-frame-pointer -ffast-math -fno-strict-aliasing

BINARIES = vw active_interactor
MANPAGES = vw.1
3 changes: 3 additions & 0 deletions library/Makefile
@@ -1,2 +1,5 @@
+ezexample: temp2.cc ../vowpalwabbit/libvw.a
+g++ -g -o $@ -l boost_program_options -l z -l pthread $< -L ../vowpalwabbit -l vw -l allreduce
+
library_example: library_example.cc ../vowpalwabbit/libvw.a
g++ -g -o $@ $< -L ../vowpalwabbit -l vw -l allreduce -l boost_program_options -l z -l pthread
131 changes: 131 additions & 0 deletions library/ezexample.h
@@ -0,0 +1,131 @@
#ifndef EZEXAMPLE_H
#define EZEXAMPLE_H

#include <stdio.h>
#include "../vowpalwabbit/vw.h"

using namespace std;
typedef uint32_t fid;

struct vw_namespace {
char namespace_letter;
public: vw_namespace(const char c) : namespace_letter(c) {}
};

class ezexample {
private:
vw*vw_ref;
vector<VW::feature_space> *dat;
vector<fid> past_seeds;
fid current_seed;
vector<feature>*current_ns;
char str[2];
bool pass_empty;
string mylabel;
ezexample(const ezexample & ex);
ezexample & operator=(const ezexample & ex);

public:

ezexample(vw*this_vw, bool pe=false) {
dat = new vector<VW::feature_space>();
vw_ref = this_vw;
current_seed = 0;
current_ns = NULL;
str[0] = ' '; str[1] = 0;
pass_empty = pe;
mylabel = "";
}

~ezexample() {
if (dat != NULL)
delete dat;
}

void addns(char c) {
str[0] = c;
dat->push_back( VW::feature_space(c, vector<feature>()) );
current_ns = &( dat->at(dat->size()-1).second );
past_seeds.push_back(current_seed);
current_seed = VW::hash_space(*vw_ref, str);
}

void remns() {
if (dat->size() == 0) {
current_seed = 0;
current_ns = NULL;
} else {
current_seed = past_seeds.back();
past_seeds.pop_back();
dat->pop_back();
current_ns = &(dat->back().second);
}
}

inline fid hash(string fstr) {
return VW::hash_feature(*vw_ref, fstr, current_seed);
}
inline fid hash(char* fstr) {
return VW::hash_feature_cstr(*vw_ref, fstr, current_seed);
}
inline fid hash(char c, string fstr) {
str[0] = c;
return VW::hash_feature(*vw_ref, fstr, VW::hash_space(*vw_ref, str));
}
inline fid hash(char c, char* fstr) {
str[0] = c;
return VW::hash_feature_cstr(*vw_ref, fstr, VW::hash_space(*vw_ref, str));
}

inline fid addf(fid fint, float val) {
if (!current_ns) return 0;
feature f = { val, fint };
current_ns->push_back(f);
return fint;
}
inline fid addf(fid fint ) { return addf(fint , 1.0); }
inline fid addf(string fstr, float val) { return addf(hash(fstr), val); }
inline fid addf(string fstr ) { return addf(hash(fstr), 1.0); }

float predict() {
static example* empty_example = VW::read_example(*vw_ref, (char*)"| ");
example *ec = VW::import_example(*vw_ref, *dat);

if (mylabel.length() > 0)
VW::parse_example_label(*vw_ref, *ec, mylabel);

vw_ref->learn(vw_ref, ec);
if (pass_empty)
vw_ref->learn(vw_ref, empty_example);
float pred = ec->final_prediction;
VW::finish_example(*vw_ref, ec);
return pred;
}

inline ezexample& set_label(string label) { mylabel = label; return *this; }
inline ezexample& operator()(fid fint ) { addf(fint, 1.0); return *this; }
inline ezexample& operator()(string fstr ) { addf(fstr, 1.0); return *this; }
inline ezexample& operator()(const char* fstr ) { addf(fstr, 1.0); return *this; }
inline ezexample& operator()(fid fint, float val) { addf(fint, val); return *this; }
inline ezexample& operator()(string fstr, float val) { addf(fstr, val); return *this; }
inline ezexample& operator()(const char* fstr, float val) { addf(fstr, val); return *this; }
inline ezexample& operator()(const vw_namespace&n) { addns(n.namespace_letter); return *this; }
inline ezexample& operator--() { remns(); return *this; }
inline float operator()() { return predict(); }


void print() {
cerr << "ezexample dat->size=" << dat->size() << ", current_seed=" << current_seed << endl;
for (size_t i=0; i<dat->size(); i++) {
cerr << " namespace(" << dat->at(i).first << "):" << endl;
for (size_t j=0; j<dat->at(i).second.size(); j++) {
cerr << " " << dat->at(i).second[j].weight_index << "\t: " << dat->at(i).second[j].x << endl;
}
}
}
};




#endif
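
For orientation (an editor's sketch, not part of the commit), here is roughly how this fluent interface chains together, following the usage in library/temp2.cc below; it assumes a model trained by library/train.sh exists as train.w:

#include "../vowpalwabbit/vw.h"
#include "ezexample.h"

int main() {
  // -t loads train.w for testing only; --quiet suppresses progress output
  vw v = VW::initialize("--hash all -q st --noconstant -i train.w -t --quiet");
  ezexample ex(&v, false);
  ex(vw_namespace('s'))("w^the")("w^man")   // operator()(string) adds a feature to the current namespace
    (vw_namespace('t'))("w^le")("w^homme"); // a vw_namespace argument opens a new namespace
  float pred = ex();                        // operator()() runs predict() and returns final_prediction
  --ex;                                     // operator-- pops the most recent namespace (t)
  v.finish(&v);
  return 0;
}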
4 changes: 2 additions & 2 deletions library/library_example.cc
@@ -13,9 +13,9 @@ inline feature vw_feature_from_string(vw& v, string fstr, unsigned long seed, fl

int main(int argc, char *argv[])
{
-vw vw = VW::initialize("--hash all -q st --noconstant");
+vw vw = VW::initialize("--hash all -q st --noconstant -i train.w");

-example *vec2 = VW::read_example(vw, "|s p^the_man w^the w^man |t p^le_homme w^le w^homme");
+example *vec2 = VW::read_example(vw, "|s p^the_man w^the w^man |t p^un_homme w^un w^homme");
vw.learn(&vw, vec2);
cerr << "p2 = " << vec2->final_prediction << endl;
VW::finish_example(vw, vec2);
75 changes: 75 additions & 0 deletions library/temp2.cc
@@ -0,0 +1,75 @@
#include <stdio.h>
#include "../vowpalwabbit/vw.h"
#include "ezexample.h"

using namespace std;

inline feature vw_feature_from_string(vw& v, string fstr, unsigned long seed, float val)
{
uint32_t foo = VW::hash_feature(v, fstr, seed);
feature f = { val, foo};
return f;
}

int main(int argc, char *argv[])
{
// INITIALIZE WITH WHATEVER YOU WOULD PUT ON THE VW COMMAND LINE -- THIS READS IN A MODEL FROM train.w
vw vw = VW::initialize("--hash all -q st --noconstant -i train.w -t --quiet");

// HAL'S SPIFFY INTERFACE USING C++ CRAZINESS
ezexample ex(&vw, false);
ex(vw_namespace('s'))
("p^the_man")
("w^the")
("w^man")
(vw_namespace('t'))
("p^le_homme")
("w^le")
("w^homme");
cerr << "should be near zero = " << ex() << endl;

--ex; // remove the most recent namespace
ex(vw_namespace('t'))
("p^un_homme")
("w^un")
("w^homme");
cerr << "should be near one = " << ex() << endl;

// AND FINISH UP
vw.finish(&vw);
}

/*
*/

/*
// JOHN'S CLUNKY INTERFACE USING STRINGS
example *vec1 = VW::read_example(vw, (char*)"|s p^the_man w^the w^man |t p^un_homme w^un w^homme");
vw.learn(&vw, vec1);
cerr << "p1 = " << vec1->final_prediction << endl;
VW::finish_example(vw, vec1);
example *vec2 = VW::read_example(vw, (char*)"|s p^the_man w^the w^man |t p^le_homme w^le w^homme");
vw.learn(&vw, vec2);
cerr << "p2 = " << vec2->final_prediction << endl;
VW::finish_example(vw, vec2);
// JOHN'S CLUNKY INTERFACE USING VECTORS
vector< VW::feature_space > ec_info;
vector<feature> s_features, t_features;
uint32_t s_hash = VW::hash_space(vw, "s");
uint32_t t_hash = VW::hash_space(vw, "t");
s_features.push_back( vw_feature_from_string(vw, "p^the_man", s_hash, 1.0) );
s_features.push_back( vw_feature_from_string(vw, "w^the", s_hash, 1.0) );
s_features.push_back( vw_feature_from_string(vw, "w^man", s_hash, 1.0) );
t_features.push_back( vw_feature_from_string(vw, "p^le_homme", t_hash, 1.0) );
t_features.push_back( vw_feature_from_string(vw, "w^le", t_hash, 1.0) );
t_features.push_back( vw_feature_from_string(vw, "w^homme", t_hash, 1.0) );
ec_info.push_back( VW::feature_space('s', s_features) );
ec_info.push_back( VW::feature_space('t', t_features) );
example* vec3 = VW::import_example(vw, ec_info);
vw.learn(&vw, vec3);
cerr << "p3 = " << vec3->final_prediction << endl;
VW::finish_example(vw, vec3);
*/
12 changes: 12 additions & 0 deletions library/test.cc
@@ -0,0 +1,12 @@
#include <iostream>
#include <vector>

using namespace std;

int main(int argc, char**argv) {
vector< pair< char, vector<int> > > u = vector< pair< char, vector<int> > >();
u.push_back( pair< char, vector<int> >('a', vector<int>()) );
vector<int>*v = &(u[0].second);
v->push_back(0);
cout << "i want this to say one: " << u[0].second.size() << endl;
}
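
An editor's guess at why this scratch test exists (the commit does not say): ezexample.h above keeps current_ns as a raw pointer into dat and re-derives it after every push_back/pop_back, because growing the outer vector can reallocate its storage and invalidate such pointers. A minimal sketch of that hazard:

#include <vector>
#include <utility>

using namespace std;

int main() {
  vector< pair< char, vector<int> > > u;
  u.push_back( pair< char, vector<int> >('a', vector<int>()) );
  vector<int>* v = &(u[0].second);  // points into u's current buffer
  u.push_back( pair< char, vector<int> >('b', vector<int>()) );
  // v may now dangle: the second push_back can reallocate u's storage,
  // so re-derive the pointer before use (as ezexample::addns does).
  v = &(u[0].second);
  v->push_back(0);
  return 0;
}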
4 changes: 4 additions & 0 deletions library/train
@@ -0,0 +1,4 @@
1 |s p^the_man w^the w^man |t p^un_homme w^un w^homme
0 |s p^the_man w^the w^man |t p^le_homme w^le w^homme
0 |s p^a_man w^a w^man |t p^un_homme w^un w^homme
1 |s p^a_man w^a w^man |t p^le_homme w^le w^homme
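
A gloss on this data (editor's note; this is standard VW input format, not specific to this commit): each line is a binary label followed by named namespaces, |s holding source-side features and |t target-side ones, with p^ marking phrase features and w^ word features. The first line above reads: label 1; namespace s with p^the_man, w^the, w^man; namespace t with p^un_homme, w^un, w^homme. With -q st (used in train.sh below), VW also crosses every s feature with every t feature, which is what lets the model score source/target pairings.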
5 changes: 5 additions & 0 deletions library/train.sh
@@ -0,0 +1,5 @@
#!/bin/bash
rm -f train.cache train.w
../vowpalwabbit/vw -c -d train -f train.w -q st --passes 100 --hash all --noconstant
../vowpalwabbit/vw -t -d train -i train.w -p train.pred --noconstant

Binary file added library/train.w
Binary file not shown.
42 changes: 21 additions & 21 deletions test/RunTests
@@ -388,102 +388,102 @@ run_tests();

__DATA__
# Test 1:
-{VW} -b 17 -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat -f models/0001.model -c --passes 2 --compressed --ngram 3 --skips 1 && rm -f train-sets/*.cache
+{VW} -k -b 17 -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat -f models/0001.model -c --passes 2 --compressed --ngram 3 --skips 1
train-sets/ref/0001.stdout
train-sets/ref/0001.stderr
# Test 2: checking predictions as well
-{VW} -t train-sets/0001.dat -i models/0001.model -p 001.predict.tmp && rm -f train-sets/*.cache
+{VW} -k -t train-sets/0001.dat -i models/0001.model -p 001.predict.tmp
test-sets/ref/0001.stdout
test-sets/ref/0001.stderr
pred-sets/ref/0001.predict
# Test 3: without -d, training only
-{VW} train-sets/0002.dat -f models/0002.model && rm -f train-sets/*.cache
+{VW} -k train-sets/0002.dat -f models/0002.model
train-sets/ref/0002.stdout
train-sets/ref/0002.stderr
# Test 4: same, with -d
-{VW} -d train-sets/0002.dat -f models/0002.model && rm -f train-sets/*.cache
+{VW} -k -d train-sets/0002.dat -f models/0002.model
train-sets/ref/0002.stdout
train-sets/ref/0002.stderr
# Test 5: add -q .., adaptive, and more (same input, different outputs)
-{VW} --initial_t 1 --power_t 0 --adaptive -q Tf -q ff -f models/0002a.model train-sets/0002.dat && rm -f train-sets/*.cache
+{VW} -k --initial_t 1 --power_t 0 --adaptive -q Tf -q ff -f models/0002a.model train-sets/0002.dat
train-sets/ref/0002a.stdout
train-sets/ref/0002a.stderr
# Test 6: run predictions on Test 4 model
# Pretending the labels aren't there
-{VW} -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict && rm -f train-sets/*.cache
+{VW} -k -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict
test-sets/ref/0002b.stdout
test-sets/ref/0002b.stderr
pred-sets/ref/0002b.predict
# Test 7: using -q and multiple threads
-{VW} --power_t 0 --adaptive -q ff -f models/0002c.model train-sets/0002.dat && rm -f train-sets/*.cache
+{VW} -k --power_t 0 --adaptive -q ff -f models/0002c.model train-sets/0002.dat
train-sets/ref/0002c.stdout
train-sets/ref/0002c.stderr
# Test 8: predicts on test 7 model
-{VW} -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict && rm -f train-sets/*.cache
+{VW} -k -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict
test-sets/ref/0002c.stdout
test-sets/ref/0002c.stderr
pred-sets/ref/0002c.predict
-# Test 9: label-dependent features with csoaa
-{VW} -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --csoaa_ldf && rm -f train-sets/*.cache
+# Test 9: label-dependent features with csoaa_ldf
+{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --csoaa_ldf multiline
train-sets/ref/cs_test.ldf.csoaa.stdout
train-sets/ref/cs_test.ldf.csoaa.stderr
train-sets/ref/cs_test.ldf.csoaa.predict
-# Test 10: label-dependent features with wap
-{VW} -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --wap_ldf && rm -f train-sets/*.cache
+# Test 10: label-dependent features with wap_ldf
+{VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --wap_ldf multiline
train-sets/ref/cs_test.ldf.wap.stdout
train-sets/ref/cs_test.ldf.wap.stderr
train-sets/ref/cs_test.ldf.wap.predict
# Test 11: Run sequence on seq_small for 12 passes, 4 passes per policy
-{VW} -c -d train-sets/seq_small --passes 12 --sequence_passes_per_policy 4 --sequence 4 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/seq_small --passes 12 --sequence_passes_per_policy 4 --sequence 4
train-sets/ref/seq_small.stdout
train-sets/ref/seq_small.stderr
# Test 12: Run sequence (wap) on wsj_small for 2 passes, 1 pass per policy, extra features
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 2 --sequence_passes_per_policy 1 --sequence 45 --wap 45 --sequence_history 2 --sequence_bigrams --sequence_features 1 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 2 --sequence_passes_per_policy 1 --sequence 45 --wap 45 --sequence_history 2 --sequence_bigrams --sequence_features 1
train-sets/ref/wsj_small.dat.stdout
train-sets/ref/wsj_small.dat.stderr
# Test 13: Run sequence (wap) on wsj_small for 1 passes, current policy, limited transitions
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 1 --sequence_allow_current_policy --sequence 45 --wap 45 --sequence_transition_file train-sets/wsj.train.tm2 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 1 --sequence_allow_current_policy --sequence 45 --wap 45 --sequence_transition_file train-sets/wsj.train.tm2
train-sets/ref/wsj_small-tm.dat.stdout
train-sets/ref/wsj_small-tm.dat.stderr
# Test 14: Run searn on seq_small for 12 passes, 4 passes per policy
-{VW} -c -d train-sets/seq_small --passes 12 --searn_passes_per_policy 4 --searn 4 --searn_task sequence && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/seq_small --passes 12 --searn_passes_per_policy 4 --searn 4 --searn_task sequence
train-sets/ref/searn_small.stdout
train-sets/ref/searn_small.stderr
# Test 15: Run searn on wsj_small for 12 passes, 4 passes per policy, extra features
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 12 --searn_passes_per_policy 4 --searn_task sequence --searn 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 12 --searn_passes_per_policy 4 --searn_task sequence --searn 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1
train-sets/ref/searn_wsj.stdout
train-sets/ref/searn_wsj.stderr
# Test 16: Run searn (wap) on wsj_small for 2 passes, 1 pass per policy, extra features
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 2 --searn_passes_per_policy 1 --searn_task sequence --searn 45 --wap 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/wsj_small.dat.gz --passes 2 --searn_passes_per_policy 1 --searn_task sequence --searn 45 --wap 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1
train-sets/ref/searn_wsj2.dat.stdout
train-sets/ref/searn_wsj2.dat.stderr
# Test 17: LBFGS on zero derivative input
-{VW} -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0
train-sets/ref/zero.stdout
train-sets/ref/zero.stderr
# Test 18: LBFGS early termination
-{VW} -c -d train-sets/rcv1_small.dat --loss_function=logistic -b 20 --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 && rm -f train-sets/*.cache
+{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic -b 20 --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0
train-sets/ref/rcv1_small.stdout
train-sets/ref/rcv1_small.stderr
# Test 19: Run LDA with 100 topics on 1000 Wikipedia articles
-{LDA} --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -b 13 --minibatch 128 train-sets/wiki1K.dat && rm -f train-sets/*.cache
+{LDA} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -b 13 --minibatch 128 train-sets/wiki1K.dat
train-sets/ref/wiki1K.stdout
train-sets/ref/wiki1K.stderr