Skip to content

Commit

Permalink
Merge pull request #2 from PaddlePaddle/develop
Browse files Browse the repository at this point in the history
rebase
  • Loading branch information
hohdiy authored Dec 5, 2016
2 parents 021b3a4 + f93af82 commit 5777fae
Show file tree
Hide file tree
Showing 24 changed files with 1,349 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ before_install:
fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
script:
- paddle/scripts/travis/main.sh
notifications:
Expand Down
Empty file modified demo/image_classification/predict.sh
100644 → 100755
Empty file.
Empty file modified demo/semantic_role_labeling/predict.sh
100644 → 100755
Empty file.
Empty file modified demo/semantic_role_labeling/test.sh
100644 → 100755
Empty file.
Empty file modified demo/semantic_role_labeling/train.sh
100644 → 100755
Empty file.
41 changes: 31 additions & 10 deletions demo/seqToseq/dataprovider.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,27 +19,44 @@
END = "<e>"


def hook(settings, src_dict, trg_dict, file_list, **kwargs):
def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
**kwargs):
# job_mode = 1: training mode
# job_mode = 0: generating mode
settings.job_mode = trg_dict is not None
settings.src_dict = src_dict
settings.job_mode = not is_generating
settings.src_dict = dict()
with open(src_dict_path, "r") as fin:
settings.src_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
settings.trg_dict = dict()
with open(trg_dict_path, "r") as fin:
settings.trg_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}

settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
settings.sample_count = 0

if settings.job_mode:
settings.trg_dict = trg_dict
settings.slots = [
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'target_language_word':
integer_value_sequence(len(settings.trg_dict)),
'target_language_next_word':
integer_value_sequence(len(settings.trg_dict))
]
}
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
settings.slots = [
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'sent_id':
integer_value_sequence(len(open(file_list[0], "r").readlines()))
]
}


def _get_ids(s, dictionary):
Expand Down Expand Up @@ -69,6 +86,10 @@ def process(settings, file_name):
continue
trg_ids_next = trg_ids + [settings.trg_dict[END]]
trg_ids = [settings.trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
yield {
'source_language_word': src_ids,
'target_language_word': trg_ids,
'target_language_next_word': trg_ids_next
}
else:
yield src_ids, [line_count]
yield {'source_language_word': src_ids, 'sent_id': [line_count]}
14 changes: 5 additions & 9 deletions demo/seqToseq/seqToseq_net.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
"""
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
src_dict = dict()
for line_count, line in enumerate(open(src_lang_dict, "r")):
src_dict[line.strip()] = line_count
trg_dict = dict()
for line_count, line in enumerate(open(trg_lang_dict, "r")):
trg_dict[line.strip()] = line_count

if is_generating:
train_list = None
test_list = os.path.join(data_dir, gen_list)
trg_dict = None
else:
train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir, test_list)
Expand All @@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict,
"trg_dict": trg_dict})
args={
"src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"is_generating": is_generating
})

return {
"src_dict_path": src_lang_dict,
Expand Down
7 changes: 3 additions & 4 deletions doc/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ AutoStructify = transform.AutoStructify
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')

templates_path = ["@PROJ_ROOT@/doc/templates"]
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]

# -- General configuration ------------------------------------------------

Expand Down Expand Up @@ -113,13 +113,12 @@ todo_include_todos = False

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#html_theme = 'sphinx_rtd_theme'
html_theme = 'classic'
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ['@PROJ_ROOT@/doc_theme/static']

# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
Expand Down
2 changes: 1 addition & 1 deletion doc/howto/cmd_parameter/arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>

<tr>
<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td>
<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
</tr>

Expand Down
10 changes: 3 additions & 7 deletions doc/howto/cmd_parameter/detail_introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
- type: string (default: null).

* `--version`
- Whether to print version infomatrion.
- Whether to print version information.
- type: bool (default: 0).

* `--show_layer_stat`
Expand Down Expand Up @@ -110,8 +110,8 @@
- type: int32 (default: -1).

* `--test_period`
- Run testing every test_period train batches. If not set, run testing each pass.
- type: int32 (default: 1000).
- If 0, run testing on all test data at the end of each pass. If non-zero, run testing on all test data every test_period batches.
- type: int32 (default: 0).

* `--test_wait`
- Whether to wait for the parameters of each pass if they do not exist yet. If test_data_path is set in the cluster submitting environment, one process will be launched to perform testing, so test_wait=1 is needed. Note that in the cluster submitting environment, this argument has been set True by default.
Expand All @@ -121,10 +121,6 @@
- File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
- type: string (default: "", null).

* `--test_all_data_in_one_period`
- This argument is usually used in testing period during traning. If true, all data will be tested in one test period. Otherwise (batch_size * log_peroid) data will be tested.
- type: bool (default: 0).

* `--predict_output_dir`
- Directory that saves the layer output. It is configured in Outputs() in network config. By default, this argument is null, meaning nothing is saved. Specify this directory if you want to save the feature maps of some layers in testing mode. Note that layer outputs are the values after the activation function.
- type: string (default: "", null).
Expand Down
5 changes: 2 additions & 3 deletions doc/howto/cmd_parameter/use_case.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ paddle train \
--config=network_config \
--save_dir=output \
--trainer_count=COUNT \ #(default:1)
--test_period=M \ #(default:1000)
--test_all_data_in_one_period=true \ #(default:false)
--num_passes=N \ #(defalut:100)
--test_period=M \ #(default:0)
--num_passes=N \ #(default:100)
--log_period=K \ #(default:100)
--dot_period=1000 \ #(default:1)
#[--show_parameter_stats_period=100] \ #(default:0)
Expand Down
8 changes: 4 additions & 4 deletions doc_cn/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"]
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]

# -- General configuration ------------------------------------------------

Expand Down Expand Up @@ -112,12 +112,12 @@ todo_include_todos = False

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#html_theme = 'sphinx_rtd_theme' # sphinx_rtd_theme will cause table bad style
html_theme = 'classic'
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ['@PROJ_ROOT@/doc_theme/static']

# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
Expand Down
38 changes: 38 additions & 0 deletions doc_cn/faq/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,41 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``

10. A protocol message was rejected because it was too big
----------------------------------------------------------

如果在训练NLP相关模型时,出现以下错误:

.. code-block:: bash
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:

.. code-block:: python
src_dict = dict()
for line_count, line in enumerate(open(src_dict_path, "r")):
src_dict[line.strip()] = line_count
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict})
解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:

.. code-block:: python
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict_path": src_dict_path})
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
Loading

0 comments on commit 5777fae

Please sign in to comment.