Merge pull request #2 from PaddlePaddle/develop

rebase
PaddlePaddle · Dec 5, 2016 · 5777fae · 5777fae
2 parents 021b3a4 + f93af82
commit 5777fae
Show file tree

Hide file tree

Showing 24 changed files with 1,349 additions and 67 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -50,7 +50,7 @@ before_install:
     fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
 script:
   - paddle/scripts/travis/main.sh
 notifications:

diff --git a/demo/image_classification/predict.sh b/demo/image_classification/predict.sh
diff --git a/demo/semantic_role_labeling/predict.sh b/demo/semantic_role_labeling/predict.sh
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
diff --git a/demo/seqToseq/dataprovider.py b/demo/seqToseq/dataprovider.py
@@ -19,27 +19,44 @@
 END = "<e>"
 
 
-def hook(settings, src_dict, trg_dict, file_list, **kwargs):
+def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
+         **kwargs):
     # job_mode = 1: training mode
     # job_mode = 0: generating mode
-    settings.job_mode = trg_dict is not None
-    settings.src_dict = src_dict
+    settings.job_mode = not is_generating
+    settings.src_dict = dict()
+    with open(src_dict_path, "r") as fin:
+        settings.src_dict = {
+            line.strip(): line_count
+            for line_count, line in enumerate(fin)
+        }
+    settings.trg_dict = dict()
+    with open(trg_dict_path, "r") as fin:
+        settings.trg_dict = {
+            line.strip(): line_count
+            for line_count, line in enumerate(fin)
+        }
+
     settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
     settings.sample_count = 0
 
     if settings.job_mode:
-        settings.trg_dict = trg_dict
-        settings.slots = [
+        settings.slots = {
+            'source_language_word':
             integer_value_sequence(len(settings.src_dict)),
+            'target_language_word':
             integer_value_sequence(len(settings.trg_dict)),
+            'target_language_next_word':
             integer_value_sequence(len(settings.trg_dict))
-        ]
+        }
         settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
     else:
-        settings.slots = [
+        settings.slots = {
+            'source_language_word':
             integer_value_sequence(len(settings.src_dict)),
+            'sent_id':
             integer_value_sequence(len(open(file_list[0], "r").readlines()))
-        ]
+        }
 
 
 def _get_ids(s, dictionary):
@@ -69,6 +86,10 @@ def process(settings, file_name):
                     continue
                 trg_ids_next = trg_ids + [settings.trg_dict[END]]
                 trg_ids = [settings.trg_dict[START]] + trg_ids
-                yield src_ids, trg_ids, trg_ids_next
+                yield {
+                    'source_language_word': src_ids,
+                    'target_language_word': trg_ids,
+                    'target_language_next_word': trg_ids_next
+                }
             else:
-                yield src_ids, [line_count]
+                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
     """
     src_lang_dict = os.path.join(data_dir, 'src.dict')
     trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-    src_dict = dict()
-    for line_count, line in enumerate(open(src_lang_dict, "r")):
-        src_dict[line.strip()] = line_count
-    trg_dict = dict()
-    for line_count, line in enumerate(open(trg_lang_dict, "r")):
-        trg_dict[line.strip()] = line_count
 
     if is_generating:
         train_list = None
         test_list = os.path.join(data_dir, gen_list)
-        trg_dict = None
     else:
         train_list = os.path.join(data_dir, train_list)
         test_list = os.path.join(data_dir, test_list)
@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
         test_list,
         module="dataprovider",
         obj="process",
-        args={"src_dict": src_dict,
-              "trg_dict": trg_dict})
+        args={
+            "src_dict_path": src_lang_dict,
+            "trg_dict_path": trg_lang_dict,
+            "is_generating": is_generating
+        })
 
     return {
         "src_dict_path": src_lang_dict,

diff --git a/doc/conf.py.in b/doc/conf.py.in
@@ -23,7 +23,7 @@ AutoStructify = transform.AutoStructify
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, '@PROJ_ROOT@/python')
 
-templates_path = ["@PROJ_ROOT@/doc/templates"]
+templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -113,13 +113,12 @@ todo_include_todos = False
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-#html_theme = 'sphinx_rtd_theme'
-html_theme = 'classic'
+html_theme = 'sphinx_rtd_theme'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ['@PROJ_ROOT@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

diff --git a/doc/howto/cmd_parameter/arguments.md b/doc/howto/cmd_parameter/arguments.md
@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
 </tr>
 
 <tr>
-<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td>
+<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 

diff --git a/doc/howto/cmd_parameter/detail_introduction.md b/doc/howto/cmd_parameter/detail_introduction.md
@@ -31,7 +31,7 @@
   - type: string (default: null).
 
 * `--version`
-  - Whether to print version infomatrion.
+  - Whether to print version information.
   - type: bool (default: 0).
 
 * `--show_layer_stat`
@@ -110,8 +110,8 @@
   - type: int32 (default: -1).
 
 * `--test_period`
-  - Run testing every test_period train batches. If not set, run testing each pass.
-  - type: int32 (default: 1000).
+  - If 0, run testing on all test data at the end of each pass. If non-zero, run testing on all test data every test_period batches.
+  - type: int32 (default: 0).
 
 * `--test_wait`
   - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perform testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
@@ -121,10 +121,6 @@
   - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
   - type: string (default: "", null).
 
-* `--test_all_data_in_one_period`
-  - This argument is usually used in testing period during traning. If true, all data will be tested in one test period. Otherwise (batch_size * log_peroid) data will be tested.
-  - type: bool (default: 0).
-
 * `--predict_output_dir`
   - Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function.
   - type: string (default: "", null).

diff --git a/doc/howto/cmd_parameter/use_case.md b/doc/howto/cmd_parameter/use_case.md
@@ -10,9 +10,8 @@ paddle train \
   --config=network_config \
   --save_dir=output \
   --trainer_count=COUNT \                #(default:1)
-  --test_period=M \                      #(default:1000）
-  --test_all_data_in_one_period=true \   #(default:false) 
-  --num_passes=N \                       #(defalut:100）
+  --test_period=M \                      #(default:0) 
+  --num_passes=N \                       #(default:100)
   --log_period=K \                       #(default:100)
   --dot_period=1000 \                    #(default:1)
   #[--show_parameter_stats_period=100] \ #(default:0)

diff --git a/doc_cn/conf.py.in b/doc_cn/conf.py.in
@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 sys.path.insert(0, '@PROJ_ROOT@/python')
-templates_path = ["@PROJ_ROOT@/doc/templates"]
+templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -112,12 +112,12 @@ todo_include_todos = False
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-#html_theme = 'sphinx_rtd_theme'  # sphinx_rtd_theme will cause table bad style
-html_theme = 'classic'
+html_theme = 'sphinx_rtd_theme'
+
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ['@PROJ_ROOT@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
@@ -214,3 +214,41 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
         cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
 
 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
+
+10. A protocol message was rejected because it was too big
+----------------------------------------------------------
+
+如果在训练NLP相关模型时，出现以下错误：
+
+..  code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) 
+
+可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：
+
+..  code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+解决方案是：将字典的地址作为args传给dataprovider，然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为：
+
+..  code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。