From cc441b7b8bf2f80b17cf44acbcb425d529876ff1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 22 Jun 2018 03:36:04 +0200 Subject: [PATCH] Fix documentation for various modules (#2096) * doc fixes * doc fixes to matutils * docsim doc fixes * doc fixes to interfaces module * doc fixes to Dictionary * doc fixes to MatrixMarket classes * doc fixes to WikiCorpus * minor code style changes in HashDictionary * fixing TfidfModel bugs + docs * fixes to phrases docs * fix PEP8 * fix documentation building * cleanup mmcorpus-related * cleanup dictionary * cleanup hashdictionary * cleanup wikicorpus * cleanup interfaces * cleanup matutils * rename smartirs signature * minor docs style fixes * regenerate *.c for mmreader (after last Radim fix) * fix bool parameters * regenerate _mmreader.c again * cleanup phrases * cleanup utils * Fix paper for phrases according to #2098, catch by @davidchall * cleanup docsim * - cleanup tfidfmodel - fix bug in smartirs_normalize (old version correct!) 
- remove persistence test & remove old models from repo (by rename reason) * typo fix * add back smartirs tests * retrying saved test files --- gensim/corpora/_mmreader.c | 2008 +++++++++++---------- gensim/corpora/_mmreader.pyx | 32 +- gensim/corpora/dictionary.py | 111 +- gensim/corpora/hashdictionary.py | 87 +- gensim/corpora/mmcorpus.py | 54 +- gensim/corpora/wikicorpus.py | 104 +- gensim/interfaces.py | 149 +- gensim/matutils.py | 293 +-- gensim/models/phrases.py | 104 +- gensim/models/tfidfmodel.py | 164 +- gensim/similarities/docsim.py | 193 +- gensim/sklearn_api/phrases.py | 11 +- gensim/test/test_data/tfidf_model.tst | Bin 1261 -> 909 bytes gensim/test/test_data/tfidf_model.tst.bz2 | Bin 822 -> 622 bytes gensim/utils.py | 426 +++-- 15 files changed, 1963 insertions(+), 1773 deletions(-) diff --git a/gensim/corpora/_mmreader.c b/gensim/corpora/_mmreader.c index a54b7d1d12..2adbf9d95d 100644 --- a/gensim/corpora/_mmreader.c +++ b/gensim/corpora/_mmreader.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.27.3 */ +/* Generated by Cython 0.28.3 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -7,7 +7,7 @@ #elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) #error Cython requires Python 2.6+ or Python 3.3+. 
#else -#define CYTHON_ABI "0_27_3" +#define CYTHON_ABI "0_28_3" #define CYTHON_FUTURE_DIVISION 0 #include #ifndef offsetof @@ -183,6 +183,103 @@ #undef BASE #undef MASK #endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] 
+ #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #elif defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif +#endif + #if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) #define Py_OptimizeFlag 0 #endif @@ -211,12 +308,12 @@ #ifndef Py_TPFLAGS_HAVE_FINALIZE #define Py_TPFLAGS_HAVE_FINALIZE 0 #endif -#if PY_VERSION_HEX < 0x030700A0 || !defined(METH_FASTCALL) +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) #ifndef METH_FASTCALL #define METH_FASTCALL 0x80 #endif - typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject **args, Py_ssize_t nargs); - typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject **args, + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames); #else #define __Pyx_PyCFunctionFast _PyCFunctionFast @@ -228,6 +325,18 @@ #else #define __Pyx_PyFastCFunction_Check(func) 0 #endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) 
PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif #if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 #define __Pyx_PyThreadState_Current PyThreadState_GET() #elif PY_VERSION_HEX >= 0x03060000 @@ -237,6 +346,36 @@ #else #define __Pyx_PyThreadState_Current _PyThreadState_Current #endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; // PyThread_create_key reports success always +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif // TSS (Thread Specific Storage) API #if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) #define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) #else @@ -249,6 +388,11 @@ #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) #endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif #if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) #define CYTHON_PEP393_ENABLED 1 #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ @@ -293,18 +437,6 @@ #if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) #endif -#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) - #define PyObject_Malloc(s) PyMem_Malloc(s) - #define PyObject_Free(p) PyMem_Free(p) - #define PyObject_Realloc(p) PyMem_Realloc(p) -#endif -#if CYTHON_COMPILING_IN_PYSTON - #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) - #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) -#else - #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) - #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) -#endif #define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) #define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None)) ? 
PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) #if PY_MAJOR_VERSION >= 3 @@ -321,6 +453,7 @@ #define PyString_Type PyUnicode_Type #define PyString_Check PyUnicode_Check #define PyString_CheckExact PyUnicode_CheckExact + #define PyObject_Unicode PyObject_Str #endif #if PY_MAJOR_VERSION >= 3 #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) @@ -332,7 +465,11 @@ #ifndef PySet_CheckExact #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) #endif -#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif #if PY_MAJOR_VERSION >= 3 #define PyIntObject PyLongObject #define PyInt_Type PyLong_Type @@ -367,16 +504,10 @@ #define __Pyx_PyInt_AsHash_t PyInt_AsSsize_t #endif #if PY_MAJOR_VERSION >= 3 - #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func)) + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? 
PyMethod_New(func, self) : (Py_INCREF(func), func)) #else #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) #endif -#ifndef __has_attribute - #define __has_attribute(x) 0 -#endif -#ifndef __has_cpp_attribute - #define __has_cpp_attribute(x) 0 -#endif #if CYTHON_USE_ASYNC_SLOTS #if PY_VERSION_HEX >= 0x030500B1 #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods @@ -394,96 +525,6 @@ unaryfunc am_anext; } __Pyx_PyAsyncMethodsStruct; #endif -#ifndef CYTHON_RESTRICT - #if defined(__GNUC__) - #define CYTHON_RESTRICT __restrict__ - #elif defined(_MSC_VER) && _MSC_VER >= 1400 - #define CYTHON_RESTRICT __restrict - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define CYTHON_RESTRICT restrict - #else - #define CYTHON_RESTRICT - #endif -#endif -#ifndef CYTHON_UNUSED -# if defined(__GNUC__) -# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) -# define CYTHON_UNUSED __attribute__ ((__unused__)) -# else -# define CYTHON_UNUSED -# endif -# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) -# define CYTHON_UNUSED __attribute__ ((__unused__)) -# else -# define CYTHON_UNUSED -# endif -#endif -#ifndef CYTHON_MAYBE_UNUSED_VAR -# if defined(__cplusplus) - template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } -# else -# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) -# endif -#endif -#ifndef CYTHON_NCP_UNUSED -# if CYTHON_COMPILING_IN_CPYTHON -# define CYTHON_NCP_UNUSED -# else -# define CYTHON_NCP_UNUSED CYTHON_UNUSED -# endif -#endif -#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) -#ifdef _MSC_VER - #ifndef _MSC_STDINT_H_ - #if _MSC_VER < 1300 - typedef unsigned char uint8_t; - typedef unsigned int uint32_t; - #else - typedef unsigned __int8 uint8_t; - typedef unsigned __int32 uint32_t; - #endif - #endif -#else - #include -#endif -#ifndef CYTHON_FALLTHROUGH - #if defined(__cplusplus) && __cplusplus >= 201103L - #if 
__has_cpp_attribute(fallthrough) - #define CYTHON_FALLTHROUGH [[fallthrough]] - #elif __has_cpp_attribute(clang::fallthrough) - #define CYTHON_FALLTHROUGH [[clang::fallthrough]] - #elif __has_cpp_attribute(gnu::fallthrough) - #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] - #endif - #endif - #ifndef CYTHON_FALLTHROUGH - #if __has_attribute(fallthrough) - #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) - #else - #define CYTHON_FALLTHROUGH - #endif - #endif - #if defined(__clang__ ) && defined(__apple_build_version__) - #if __apple_build_version__ < 7000000 - #undef CYTHON_FALLTHROUGH - #define CYTHON_FALLTHROUGH - #endif - #endif -#endif - -#ifndef CYTHON_INLINE - #if defined(__clang__) - #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) - #elif defined(__GNUC__) - #define CYTHON_INLINE __inline__ - #elif defined(_MSC_VER) - #define CYTHON_INLINE __inline - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define CYTHON_INLINE inline - #else - #define CYTHON_INLINE - #endif -#endif #if defined(WIN32) || defined(MS_WINDOWS) #define _USE_MATH_DEFINES @@ -520,6 +561,7 @@ static CYTHON_INLINE float __PYX_NAN() { #define __PYX_HAVE__gensim__corpora___mmreader #define __PYX_HAVE_API__gensim__corpora___mmreader +/* Early includes */ #include #include #ifdef _OPENMP @@ -606,7 +648,7 @@ static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { #define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode #define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) #define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) -#define __Pyx_PyBool_FromLong(b) ((b) ? 
__Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False)) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); #define __Pyx_PySequence_Tuple(obj)\ @@ -714,7 +756,7 @@ static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } static PyObject *__pyx_m = NULL; static PyObject *__pyx_d; static PyObject *__pyx_b; -static PyObject *__pyx_cython_runtime; +static PyObject *__pyx_cython_runtime = NULL; static PyObject *__pyx_empty_tuple; static PyObject *__pyx_empty_bytes; static PyObject *__pyx_empty_unicode; @@ -725,7 +767,7 @@ static const char *__pyx_filename; static const char *__pyx_f[] = { - "_mmreader.pyx", + "gensim/corpora/_mmreader.pyx", "stringsource", }; @@ -735,11 +777,11 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__; struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr; struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__; -/* "gensim/corpora/_mmreader.pyx":19 +/* "gensim/corpora/_mmreader.pyx":21 * * * cdef class MmReader(object): # <<<<<<<<<<<<<< - * """Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. + * """Matrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`. 
* */ struct __pyx_obj_6gensim_7corpora_9_mmreader_MmReader { @@ -752,7 +794,7 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader_MmReader { }; -/* "gensim/corpora/_mmreader.pyx":45 +/* "gensim/corpora/_mmreader.pyx":47 * cdef public long long num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< @@ -765,7 +807,7 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ { }; -/* "gensim/corpora/_mmreader.pyx":75 +/* "gensim/corpora/_mmreader.pyx":77 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< @@ -782,11 +824,11 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr { }; -/* "gensim/corpora/_mmreader.pyx":107 +/* "gensim/corpora/_mmreader.pyx":109 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """Iterate through corpus. + * """Iterate through all documents in the corpus. * */ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ { @@ -878,16 +920,7 @@ struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ { /* PyObjectGetAttrStr.proto */ #if CYTHON_USE_TYPE_SLOTS -static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { - PyTypeObject* tp = Py_TYPE(obj); - if (likely(tp->tp_getattro)) - return tp->tp_getattro(obj, attr_name); -#if PY_MAJOR_VERSION < 3 - if (likely(tp->tp_getattr)) - return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); -#endif - return PyObject_GetAttr(obj, attr_name); -} +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); #else #define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) #endif @@ -1137,6 +1170,20 @@ static CYTHON_INLINE int __Pyx_HasAttr(PyObject *, PyObject *); /* IncludeStringH.proto */ #include +/* PyObject_GenericGetAttrNoDict.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX 
< 0x03070000 +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GenericGetAttrNoDict PyObject_GenericGetAttr +#endif + +/* PyObject_GenericGetAttr.proto */ +#if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GenericGetAttr PyObject_GenericGetAttr +#endif + /* SetupReduce.proto */ static int __Pyx_setup_reduce(PyObject* type_obj); @@ -1192,6 +1239,7 @@ static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObj #define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) #define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) #endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) /* FetchCommonType.proto */ static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type); @@ -1223,14 +1271,15 @@ typedef struct { PyObject *gi_name; PyObject *gi_qualname; PyObject *gi_modulename; + PyObject *gi_code; int resume_label; char is_running; } __pyx_CoroutineObject; static __pyx_CoroutineObject *__Pyx__Coroutine_New( - PyTypeObject *type, __pyx_coroutine_body_t body, PyObject *closure, + PyTypeObject *type, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, PyObject *name, PyObject *qualname, PyObject *module_name); static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( - __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *closure, + __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, PyObject *name, PyObject *qualname, PyObject *module_name); static int __Pyx_Coroutine_clear(PyObject *self); static PyObject *__Pyx_Coroutine_Send(PyObject *self, PyObject *value); @@ -1264,8 +1313,8 @@ static int 
__Pyx_patch_abc(void); #define __Pyx_Generator_USED static PyTypeObject *__pyx_GeneratorType = 0; #define __Pyx_Generator_CheckExact(obj) (Py_TYPE(obj) == __pyx_GeneratorType) -#define __Pyx_Generator_New(body, closure, name, qualname, module_name)\ - __Pyx__Coroutine_New(__pyx_GeneratorType, body, closure, name, qualname, module_name) +#define __Pyx_Generator_New(body, code, closure, name, qualname, module_name)\ + __Pyx__Coroutine_New(__pyx_GeneratorType, body, code, closure, name, qualname, module_name) static PyObject *__Pyx_Generator_Next(PyObject *self); static int __pyx_Generator_init(void); @@ -1472,8 +1521,9 @@ static PyObject *__pyx_tuple__5; static PyObject *__pyx_tuple__6; static PyObject *__pyx_tuple__7; static PyObject *__pyx_codeobj__8; +/* Late includes */ -/* "gensim/corpora/_mmreader.pyx":45 +/* "gensim/corpora/_mmreader.pyx":47 * cdef public long long num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< @@ -1483,7 +1533,7 @@ static PyObject *__pyx_codeobj__8; /* Python wrapper */ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__pyx_v_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__[] = "\n\n Parameters\n ----------\n input : {str, file-like object}\n Path to input file in MM format or a file-like object that supports `seek()`\n (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`).\n\n transposed : bool, optional\n if True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value.\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__[] = "\n\n Parameters\n ----------\n input : {str, file-like object}\n Path to the input file in MM format or a file-like object that supports `seek()`\n (e.g. 
smart_open objects).\n\n transposed : bool, optional\n Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?\n\n "; #if CYTHON_COMPILING_IN_CPYTHON struct wrapperbase __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__; #endif @@ -1511,17 +1561,17 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: - if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_input)) != 0)) kw_args--; + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_input)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: if (kw_args > 0) { - PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_transposed); + PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_transposed); if (value) { values[1] = value; kw_args--; } } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(0, 45, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "__init__") < 0)) __PYX_ERR(0, 47, __pyx_L3_error) } } else { switch (PyTuple_GET_SIZE(__pyx_args)) { @@ -1537,7 +1587,7 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("__init__", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 45, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("__init__", 0, 1, 2, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 47, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -1551,7 +1601,7 @@ static int __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_1__init__(PyObject *__ } static PyObject 
*__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject *__pyx_sent_value); /* proto */ -/* "gensim/corpora/_mmreader.pyx":75 +/* "gensim/corpora/_mmreader.pyx":77 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< @@ -1568,7 +1618,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexp if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 75, __pyx_L1_error) + __PYX_ERR(0, 77, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } @@ -1576,7 +1626,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexp __Pyx_INCREF(((PyObject *)__pyx_cur_scope->__pyx_outer_scope)); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_outer_scope); { - __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_init___locals_genexpr, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2generator1, NULL, (PyObject *) __pyx_cur_scope, __pyx_n_s_genexpr, __pyx_n_s_init___locals_genexpr, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_DECREF(__pyx_cur_scope); __Pyx_RefNannyFinishContext(); return (PyObject *) gen; @@ -1611,9 +1661,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener return NULL; } __pyx_L3_first_run:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 75, __pyx_L1_error) - if 
(unlikely(!__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line)) { __Pyx_RaiseClosureNameError("line"); __PYX_ERR(0, 75, __pyx_L1_error) } - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 77, __pyx_L1_error) + if (unlikely(!__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line)) { __Pyx_RaiseClosureNameError("line"); __PYX_ERR(0, 77, __pyx_L1_error) } + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_outer_scope->__pyx_v_line, __pyx_n_s_split); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -1626,10 +1676,10 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener } } if (__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -1637,9 +1687,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_4 = 0; __pyx_t_5 = NULL; } else { - __pyx_t_4 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_4 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_5 = Py_TYPE(__pyx_t_2)->tp_iternext; if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_5 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 77, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { @@ -1647,17 +1697,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener if (likely(PyList_CheckExact(__pyx_t_2))) { if (__pyx_t_4 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 77, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_4 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_4); __Pyx_INCREF(__pyx_t_1); __pyx_t_4++; if (unlikely(0 < 0)) __PYX_ERR(0, 77, __pyx_L1_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_4); __pyx_t_4++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ -1667,7 +1717,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) 
PyErr_Clear(); - else __PYX_ERR(0, 75, __pyx_L1_error) + else __PYX_ERR(0, 77, __pyx_L1_error) } break; } @@ -1677,7 +1727,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_x, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyNumber_Int(__pyx_cur_scope->__pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyNumber_Int(__pyx_cur_scope->__pyx_v_x); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -1697,7 +1747,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener __Pyx_XGOTREF(__pyx_t_2); __pyx_t_4 = __pyx_cur_scope->__pyx_t_1; __pyx_t_5 = __pyx_cur_scope->__pyx_t_2; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 75, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 77, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; CYTHON_MAYBE_UNUSED_VAR(__pyx_cur_scope); @@ -1719,7 +1769,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_8__init___2gener return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":45 +/* "gensim/corpora/_mmreader.pyx":47 * cdef public long long num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< @@ -1762,21 +1812,21 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 45, __pyx_L1_error) + __PYX_ERR(0, 47, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } - /* "gensim/corpora/_mmreader.pyx":58 + /* "gensim/corpora/_mmreader.pyx":60 * * """ * logger.info("initializing cython corpus reader from %s", input) # <<<<<<<<<<<<<< * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: 
*/ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; @@ -1794,7 +1844,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_kp_s_initializing_cython_corpus_reade, __pyx_v_input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -1802,13 +1852,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_kp_s_initializing_cython_corpus_reade, __pyx_v_input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 2+__pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_5 = PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_5 = 
PyTuple_New(2+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); if (__pyx_t_2) { __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_2); __pyx_t_2 = NULL; @@ -1819,14 +1869,14 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_INCREF(__pyx_v_input); __Pyx_GIVEREF(__pyx_v_input); PyTuple_SET_ITEM(__pyx_t_5, 1+__pyx_t_4, __pyx_v_input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":59 + /* "gensim/corpora/_mmreader.pyx":61 * """ * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed # <<<<<<<<<<<<<< @@ -1835,7 +1885,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ __pyx_t_1 = __pyx_v_input; __Pyx_INCREF(__pyx_t_1); - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_v_transposed); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 59, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_v_transposed); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 61, __pyx_L1_error) __Pyx_GIVEREF(__pyx_t_1); __Pyx_GOTREF(__pyx_v_self->input); __Pyx_DECREF(__pyx_v_self->input); @@ -1843,7 +1893,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_1 = 0; __pyx_v_self->transposed = __pyx_t_6; - /* "gensim/corpora/_mmreader.pyx":60 + /* "gensim/corpora/_mmreader.pyx":62 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: # 
<<<<<<<<<<<<<< @@ -1851,9 +1901,9 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ * header = utils.to_unicode(next(lines)).strip() */ /*with:*/ { - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_open_file); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_open_file); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_3 = NULL; @@ -1867,13 +1917,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -1881,27 +1931,27 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_v_self->input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } } __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __pyx_t_7 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_7); - __pyx_t_2 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 60, __pyx_L3_error) + __pyx_t_2 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L3_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -1914,10 +1964,10 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_3) { - __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L3_error) + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_2, 
__pyx_t_3); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L3_error) __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else { - __pyx_t_5 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 60, __pyx_L3_error) + __pyx_t_5 = __Pyx_PyObject_CallNoArg(__pyx_t_2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L3_error) } __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; @@ -1936,7 +1986,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_lines = __pyx_t_2; __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":61 + /* "gensim/corpora/_mmreader.pyx":63 * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -1952,19 +2002,19 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XGOTREF(__pyx_t_13); /*try:*/ { - /* "gensim/corpora/_mmreader.pyx":62 + /* "gensim/corpora/_mmreader.pyx":64 * with utils.open_file(self.input) as lines: * try: * header = utils.to_unicode(next(lines)).strip() # <<<<<<<<<<<<<< * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( */ - __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __pyx_t_5 = __Pyx_PyIter_Next(__pyx_v_lines); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_5 = __Pyx_PyIter_Next(__pyx_v_lines); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 64, 
__pyx_L13_error) __Pyx_GOTREF(__pyx_t_5); __pyx_t_14 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { @@ -1977,14 +2027,14 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_14) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_5}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; @@ -1993,26 +2043,26 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_14, __pyx_t_5}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else #endif { - __pyx_t_15 = PyTuple_New(1+1); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_15 = PyTuple_New(1+1); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_15); __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_15, 0, __pyx_t_14); __pyx_t_14 = NULL; 
__Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_15, 0+1, __pyx_t_5); __pyx_t_5 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_15, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_15, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_strip); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_strip); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = NULL; @@ -2026,24 +2076,24 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_1) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 64, __pyx_L13_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { - __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 62, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 64, __pyx_L13_error) } __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_v_header = __pyx_t_2; __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":63 + /* "gensim/corpora/_mmreader.pyx":65 * try: * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % */ - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_header, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) 
__PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_v_header, __pyx_n_s_lower); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_1 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_3))) { @@ -2056,32 +2106,32 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (__pyx_t_1) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } else { - __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 65, __pyx_L13_error) } __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_startswith); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_startswith); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 63, __pyx_L13_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 65, __pyx_L13_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_16 = ((!__pyx_t_6) != 0); - if (__pyx_t_16) { + if (unlikely(__pyx_t_16)) { - /* 
"gensim/corpora/_mmreader.pyx":66 + /* "gensim/corpora/_mmreader.pyx":68 * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % * (self.input, header) # <<<<<<<<<<<<<< * ) * except StopIteration: */ - __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 66, __pyx_L13_error) + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 68, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); @@ -2090,37 +2140,32 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GIVEREF(__pyx_v_header); PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_v_header); - /* "gensim/corpora/_mmreader.pyx":65 + /* "gensim/corpora/_mmreader.pyx":67 * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % # <<<<<<<<<<<<<< * (self.input, header) * ) */ - __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_File_s_not_in_Matrix_Market_form, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 65, __pyx_L13_error) + __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_File_s_not_in_Matrix_Market_form, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 67, __pyx_L13_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":64 + /* "gensim/corpora/_mmreader.pyx":66 * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): * raise ValueError( # <<<<<<<<<<<<<< * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % * (self.input, header) */ - __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 64, __pyx_L13_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 66, 
__pyx_L13_error) __Pyx_GOTREF(__pyx_t_2); - __Pyx_GIVEREF(__pyx_t_3); - PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); - __pyx_t_3 = 0; - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 64, __pyx_L13_error) - __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __Pyx_Raise(__pyx_t_3, 0, 0, 0); __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __PYX_ERR(0, 64, __pyx_L13_error) + __Pyx_Raise(__pyx_t_2, 0, 0, 0); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __PYX_ERR(0, 66, __pyx_L13_error) - /* "gensim/corpora/_mmreader.pyx":63 + /* "gensim/corpora/_mmreader.pyx":65 * try: * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< @@ -2129,7 +2174,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":61 + /* "gensim/corpora/_mmreader.pyx":63 * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -2146,10 +2191,10 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":68 + /* "gensim/corpora/_mmreader.pyx":70 * (self.input, header) * ) * except StopIteration: # <<<<<<<<<<<<<< @@ -2164,7 +2209,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ goto __pyx_L15_except_error; __pyx_L15_except_error:; - /* "gensim/corpora/_mmreader.pyx":61 + /* "gensim/corpora/_mmreader.pyx":63 * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: * try: # <<<<<<<<<<<<<< @@ -2184,7 +2229,7 @@ static int 
__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_L18_try_end:; } - /* "gensim/corpora/_mmreader.pyx":71 + /* "gensim/corpora/_mmreader.pyx":73 * pass * * self.num_docs = self.num_terms = self.num_nnz = 0 # <<<<<<<<<<<<<< @@ -2195,7 +2240,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_self->num_terms = 0; __pyx_v_self->num_nnz = 0; - /* "gensim/corpora/_mmreader.pyx":72 + /* "gensim/corpora/_mmreader.pyx":74 * * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): # <<<<<<<<<<<<<< @@ -2203,41 +2248,41 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ * if not line.startswith('%'): */ __Pyx_INCREF(__pyx_int_0); - __pyx_t_3 = __pyx_int_0; + __pyx_t_2 = __pyx_int_0; if (likely(PyList_CheckExact(__pyx_v_lines)) || PyTuple_CheckExact(__pyx_v_lines)) { - __pyx_t_2 = __pyx_v_lines; __Pyx_INCREF(__pyx_t_2); __pyx_t_17 = 0; + __pyx_t_3 = __pyx_v_lines; __Pyx_INCREF(__pyx_t_3); __pyx_t_17 = 0; __pyx_t_18 = NULL; } else { - __pyx_t_17 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_v_lines); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 72, __pyx_L7_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_18 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 72, __pyx_L7_error) + __pyx_t_17 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_lines); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 74, __pyx_L7_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_18 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 74, __pyx_L7_error) } for (;;) { if (likely(!__pyx_t_18)) { - if (likely(PyList_CheckExact(__pyx_t_2))) { - if (__pyx_t_17 >= PyList_GET_SIZE(__pyx_t_2)) break; + if (likely(PyList_CheckExact(__pyx_t_3))) { + if (__pyx_t_17 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 
0)) __PYX_ERR(0, 72, __pyx_L7_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 74, __pyx_L7_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, __pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { - if (__pyx_t_17 >= PyTuple_GET_SIZE(__pyx_t_2)) break; + if (__pyx_t_17 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 72, __pyx_L7_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_17); __Pyx_INCREF(__pyx_t_1); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 74, __pyx_L7_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, __pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_3, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); #endif } } else { - __pyx_t_1 = __pyx_t_18(__pyx_t_2); + __pyx_t_1 = __pyx_t_18(__pyx_t_3); if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 72, __pyx_L7_error) + else __PYX_ERR(0, 74, __pyx_L7_error) } break; } @@ -2247,24 +2292,24 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_line, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_INCREF(__pyx_t_3); - __Pyx_XDECREF_SET(__pyx_v_lineno, __pyx_t_3); - __pyx_t_1 = __Pyx_PyInt_AddObjC(__pyx_t_3, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 72, 
__pyx_L7_error) + __Pyx_INCREF(__pyx_t_2); + __Pyx_XDECREF_SET(__pyx_v_lineno, __pyx_t_2); + __pyx_t_1 = __Pyx_PyInt_AddObjC(__pyx_t_2, __pyx_int_1, 1, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_3); - __pyx_t_3 = __pyx_t_1; + __Pyx_DECREF(__pyx_t_2); + __pyx_t_2 = __pyx_t_1; __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":73 + /* "gensim/corpora/_mmreader.pyx":75 * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) # <<<<<<<<<<<<<< * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) */ - __pyx_t_15 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_15 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_15); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_15, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_15, __pyx_n_s_to_unicode); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __pyx_t_15 = NULL; @@ -2278,13 +2323,13 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ } } if (!__pyx_t_15) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_15, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, 
__pyx_L7_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -2292,19 +2337,19 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_15, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_14 = PyTuple_New(1+1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_14 = PyTuple_New(1+1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_14); __Pyx_GIVEREF(__pyx_t_15); PyTuple_SET_ITEM(__pyx_t_14, 0, __pyx_t_15); __pyx_t_15 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_line); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_14, 0+1, __pyx_cur_scope->__pyx_v_line); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 73, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_14, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; } @@ -2315,43 +2360,39 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":74 + /* "gensim/corpora/_mmreader.pyx":76 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x 
in line.split()) * if not self.transposed: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 74, __pyx_L7_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_cur_scope->__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 76, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 74, __pyx_L7_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_tuple__3, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 76, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_16 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_16 < 0)) __PYX_ERR(0, 74, __pyx_L7_error) + __pyx_t_16 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_16 < 0)) __PYX_ERR(0, 76, __pyx_L7_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_6 = ((!__pyx_t_16) != 0); if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":75 + /* "gensim/corpora/_mmreader.pyx":77 * line = utils.to_unicode(line) * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) # <<<<<<<<<<<<<< * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs */ - __pyx_t_5 = __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexpr(((PyObject*)__pyx_cur_scope)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_5 = __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__init___genexpr(((PyObject*)__pyx_cur_scope)); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_5); if ((likely(PyTuple_CheckExact(__pyx_t_5))) || (PyList_CheckExact(__pyx_t_5))) { PyObject* sequence = __pyx_t_5; - #if !CYTHON_COMPILING_IN_PYPY - Py_ssize_t size = Py_SIZE(sequence); - #else - Py_ssize_t size = PySequence_Size(sequence); - #endif + Py_ssize_t size = 
__Pyx_PySequence_SIZE(sequence); if (unlikely(size != 3)) { if (size > 3) __Pyx_RaiseTooManyValuesError(3); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); - __PYX_ERR(0, 75, __pyx_L7_error) + __PYX_ERR(0, 77, __pyx_L7_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { @@ -2367,17 +2408,17 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_INCREF(__pyx_t_14); __Pyx_INCREF(__pyx_t_15); #else - __pyx_t_1 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_1 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_14 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_14 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_14); - __pyx_t_15 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_15 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_15); #endif __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else { Py_ssize_t index = -1; - __pyx_t_19 = PyObject_GetIter(__pyx_t_5); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_19 = PyObject_GetIter(__pyx_t_5); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_GOTREF(__pyx_t_19); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_20 = Py_TYPE(__pyx_t_19)->tp_iternext; @@ -2387,7 +2428,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_GOTREF(__pyx_t_14); index = 2; __pyx_t_15 = __pyx_t_20(__pyx_t_19); if (unlikely(!__pyx_t_15)) goto __pyx_L23_unpacking_failed; __Pyx_GOTREF(__pyx_t_15); - if (__Pyx_IternextUnpackEndCheck(__pyx_t_20(__pyx_t_19), 3) < 0) __PYX_ERR(0, 75, __pyx_L7_error) + if 
(__Pyx_IternextUnpackEndCheck(__pyx_t_20(__pyx_t_19), 3) < 0) __PYX_ERR(0, 77, __pyx_L7_error) __pyx_t_20 = NULL; __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; goto __pyx_L24_unpacking_done; @@ -2395,20 +2436,20 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; __pyx_t_20 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); - __PYX_ERR(0, 75, __pyx_L7_error) + __PYX_ERR(0, 77, __pyx_L7_error) __pyx_L24_unpacking_done:; } - __pyx_t_21 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_1); if (unlikely((__pyx_t_21 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_21 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_1); if (unlikely((__pyx_t_21 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_22 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_14); if (unlikely((__pyx_t_22 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_22 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_14); if (unlikely((__pyx_t_22 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; - __pyx_t_23 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_15); if (unlikely((__pyx_t_23 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 75, __pyx_L7_error) + __pyx_t_23 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_15); if (unlikely((__pyx_t_23 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 77, __pyx_L7_error) __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __pyx_v_self->num_docs = __pyx_t_21; __pyx_v_self->num_terms = __pyx_t_22; __pyx_v_self->num_nnz = __pyx_t_23; - /* "gensim/corpora/_mmreader.pyx":76 + /* "gensim/corpora/_mmreader.pyx":78 * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: # <<<<<<<<<<<<<< @@ -2418,7 +2459,7 @@ static int 
__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_t_6 = ((!(__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":77 + /* "gensim/corpora/_mmreader.pyx":79 * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs # <<<<<<<<<<<<<< @@ -2430,7 +2471,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_v_self->num_docs = __pyx_t_23; __pyx_v_self->num_terms = __pyx_t_22; - /* "gensim/corpora/_mmreader.pyx":76 + /* "gensim/corpora/_mmreader.pyx":78 * if not line.startswith('%'): * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: # <<<<<<<<<<<<<< @@ -2439,7 +2480,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":78 + /* "gensim/corpora/_mmreader.pyx":80 * if not self.transposed: * self.num_docs, self.num_terms = self.num_terms, self.num_docs * break # <<<<<<<<<<<<<< @@ -2448,7 +2489,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ goto __pyx_L21_break; - /* "gensim/corpora/_mmreader.pyx":74 + /* "gensim/corpora/_mmreader.pyx":76 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< @@ -2457,7 +2498,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } - /* "gensim/corpora/_mmreader.pyx":72 + /* "gensim/corpora/_mmreader.pyx":74 * * self.num_docs = self.num_terms = self.num_nnz = 0 * for lineno, line in enumerate(lines): # <<<<<<<<<<<<<< @@ -2466,10 +2507,10 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ */ } __pyx_L21_break:; - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* 
"gensim/corpora/_mmreader.pyx":60 + /* "gensim/corpora/_mmreader.pyx":62 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: # <<<<<<<<<<<<<< @@ -2487,35 +2528,35 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; __Pyx_XDECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; - __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; /*except:*/ { __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename); - if (__Pyx_GetException(&__pyx_t_3, &__pyx_t_2, &__pyx_t_5) < 0) __PYX_ERR(0, 60, __pyx_L9_except_error) - __Pyx_GOTREF(__pyx_t_3); + if (__Pyx_GetException(&__pyx_t_2, &__pyx_t_3, &__pyx_t_5) < 0) __PYX_ERR(0, 62, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_t_3); __Pyx_GOTREF(__pyx_t_5); - __pyx_t_15 = PyTuple_Pack(3, __pyx_t_3, __pyx_t_2, __pyx_t_5); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 60, __pyx_L9_except_error) + __pyx_t_15 = PyTuple_Pack(3, __pyx_t_2, __pyx_t_3, __pyx_t_5); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 62, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_15); __pyx_t_13 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_t_15, NULL); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; - if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 60, __pyx_L9_except_error) + if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 62, __pyx_L9_except_error) __Pyx_GOTREF(__pyx_t_13); __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_13); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - if (__pyx_t_6 < 0) __PYX_ERR(0, 60, __pyx_L9_except_error) + if (__pyx_t_6 < 0) __PYX_ERR(0, 62, __pyx_L9_except_error) __pyx_t_16 = ((!(__pyx_t_6 != 0)) != 0); if (__pyx_t_16) { - __Pyx_GIVEREF(__pyx_t_3); __Pyx_GIVEREF(__pyx_t_2); + 
__Pyx_GIVEREF(__pyx_t_3); __Pyx_XGIVEREF(__pyx_t_5); - __Pyx_ErrRestoreWithState(__pyx_t_3, __pyx_t_2, __pyx_t_5); - __pyx_t_3 = 0; __pyx_t_2 = 0; __pyx_t_5 = 0; - __PYX_ERR(0, 60, __pyx_L9_except_error) + __Pyx_ErrRestoreWithState(__pyx_t_2, __pyx_t_3, __pyx_t_5); + __pyx_t_2 = 0; __pyx_t_3 = 0; __pyx_t_5 = 0; + __PYX_ERR(0, 62, __pyx_L9_except_error) } - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; goto __pyx_L8_exception_handled; } @@ -2538,7 +2579,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ if (__pyx_t_7) { __pyx_t_10 = __Pyx_PyObject_Call(__pyx_t_7, __pyx_tuple__4, NULL); __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; - if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 60, __pyx_L1_error) + if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 62, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; } @@ -2553,68 +2594,68 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __pyx_L29:; } - /* "gensim/corpora/_mmreader.pyx":80 + /* "gensim/corpora/_mmreader.pyx":82 * break * * logger.info( # <<<<<<<<<<<<<< * "accepted corpus with %i documents, %i features, %i non-zero entries", * self.num_docs, self.num_terms, self.num_nnz */ - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 80, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_info); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 80, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_logger); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_info); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 82, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* 
"gensim/corpora/_mmreader.pyx":82 + /* "gensim/corpora/_mmreader.pyx":84 * logger.info( * "accepted corpus with %i documents, %i features, %i non-zero entries", * self.num_docs, self.num_terms, self.num_nnz # <<<<<<<<<<<<<< * ) * */ - __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 82, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_15 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 82, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 84, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_15 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_15)) __PYX_ERR(0, 84, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_15); - __pyx_t_14 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 82, __pyx_L1_error) + __pyx_t_14 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_14)) __PYX_ERR(0, 84, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_14); __pyx_t_1 = NULL; __pyx_t_4 = 0; - if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { - __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_3); + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_2); if (likely(__pyx_t_1)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); __Pyx_INCREF(__pyx_t_1); __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_3, function); + __Pyx_DECREF_SET(__pyx_t_2, function); __pyx_t_4 = 1; } } #if CYTHON_FAST_PYCALL - if (PyFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_2, __pyx_t_15, __pyx_t_14}; - __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 4+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) + if 
(PyFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_3, __pyx_t_15, __pyx_t_14}; + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-__pyx_t_4, 4+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_5); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; } else #endif #if CYTHON_FAST_PYCCALL - if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_2, __pyx_t_15, __pyx_t_14}; - __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_4, 4+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { + PyObject *__pyx_temp[5] = {__pyx_t_1, __pyx_kp_s_accepted_corpus_with_i_documents, __pyx_t_3, __pyx_t_15, __pyx_t_14}; + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-__pyx_t_4, 4+__pyx_t_4); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_5); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_DECREF(__pyx_t_15); __pyx_t_15 = 0; __Pyx_DECREF(__pyx_t_14); __pyx_t_14 = 0; } else #endif { - __pyx_t_19 = PyTuple_New(4+__pyx_t_4); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 80, __pyx_L1_error) + __pyx_t_19 = PyTuple_New(4+__pyx_t_4); if (unlikely(!__pyx_t_19)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_19); if (__pyx_t_1) { __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_19, 0, __pyx_t_1); __pyx_t_1 = NULL; @@ -2622,23 +2663,23 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ __Pyx_INCREF(__pyx_kp_s_accepted_corpus_with_i_documents); 
__Pyx_GIVEREF(__pyx_kp_s_accepted_corpus_with_i_documents); PyTuple_SET_ITEM(__pyx_t_19, 0+__pyx_t_4, __pyx_kp_s_accepted_corpus_with_i_documents); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_19, 1+__pyx_t_4, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_19, 1+__pyx_t_4, __pyx_t_3); __Pyx_GIVEREF(__pyx_t_15); PyTuple_SET_ITEM(__pyx_t_19, 2+__pyx_t_4, __pyx_t_15); __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_19, 3+__pyx_t_4, __pyx_t_14); - __pyx_t_2 = 0; + __pyx_t_3 = 0; __pyx_t_15 = 0; __pyx_t_14 = 0; - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_19, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 80, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_19, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 82, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_19); __pyx_t_19 = 0; } - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - /* "gensim/corpora/_mmreader.pyx":45 + /* "gensim/corpora/_mmreader.pyx":47 * cdef public long long num_docs, num_terms, num_nnz * * def __init__(self, input, transposed=True): # <<<<<<<<<<<<<< @@ -2668,17 +2709,17 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader___init__(struct __pyx_ return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":85 +/* "gensim/corpora/_mmreader.pyx":87 * ) * * def __len__(self): # <<<<<<<<<<<<<< - * """Get size of corpus (number of documents).""" + * """Get the corpus size: total number of documents.""" * return self.num_docs */ /* Python wrapper */ static Py_ssize_t __pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_3__len__(PyObject *__pyx_v_self); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__[] = "Get size of corpus (number of documents)."; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__[] = "Get the corpus size: total number of documents."; #if CYTHON_COMPILING_IN_CPYTHON struct wrapperbase 
__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__; #endif @@ -2698,9 +2739,9 @@ static Py_ssize_t __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__len__", 0); - /* "gensim/corpora/_mmreader.pyx":87 + /* "gensim/corpora/_mmreader.pyx":89 * def __len__(self): - * """Get size of corpus (number of documents).""" + * """Get the corpus size: total number of documents.""" * return self.num_docs # <<<<<<<<<<<<<< * * def __str__(self): @@ -2708,11 +2749,11 @@ static Py_ssize_t __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct __pyx_r = __pyx_v_self->num_docs; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":85 + /* "gensim/corpora/_mmreader.pyx":87 * ) * * def __len__(self): # <<<<<<<<<<<<<< - * """Get size of corpus (number of documents).""" + * """Get the corpus size: total number of documents.""" * return self.num_docs */ @@ -2722,7 +2763,7 @@ static Py_ssize_t __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_2__len__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":89 +/* "gensim/corpora/_mmreader.pyx":91 * return self.num_docs * * def __str__(self): # <<<<<<<<<<<<<< @@ -2752,7 +2793,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct PyObject *__pyx_t_4 = NULL; __Pyx_RefNannySetupContext("__str__", 0); - /* "gensim/corpora/_mmreader.pyx":90 + /* "gensim/corpora/_mmreader.pyx":92 * * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % # <<<<<<<<<<<<<< @@ -2761,20 +2802,20 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct */ __Pyx_XDECREF(__pyx_r); - /* "gensim/corpora/_mmreader.pyx":91 + /* "gensim/corpora/_mmreader.pyx":93 * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % * (self.num_docs, self.num_terms, self.num_nnz)) # <<<<<<<<<<<<<< * * def skip_headers(self, input_file): */ - __pyx_t_1 = 
__Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 91, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 93, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 91, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 93, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 91, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 93, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 91, __pyx_L1_error) + __pyx_t_4 = PyTuple_New(3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 93, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); @@ -2786,21 +2827,21 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct __pyx_t_2 = 0; __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":90 + /* "gensim/corpora/_mmreader.pyx":92 * * def __str__(self): * return ("MmCorpus(%i documents, %i features, %i non-zero entries)" % # <<<<<<<<<<<<<< * (self.num_docs, self.num_terms, self.num_nnz)) * */ - __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_MmCorpus_i_documents_i_features, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 90, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyString_Format(__pyx_kp_s_MmCorpus_i_documents_i_features, __pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 92, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_r = __pyx_t_3; __pyx_t_3 = 0; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":89 + /* "gensim/corpora/_mmreader.pyx":91 * return self.num_docs * 
* def __str__(self): # <<<<<<<<<<<<<< @@ -2822,7 +2863,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_4__str__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":93 +/* "gensim/corpora/_mmreader.pyx":95 * (self.num_docs, self.num_terms, self.num_nnz)) * * def skip_headers(self, input_file): # <<<<<<<<<<<<<< @@ -2856,7 +2897,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY int __pyx_t_6; __Pyx_RefNannySetupContext("skip_headers", 0); - /* "gensim/corpora/_mmreader.pyx":102 + /* "gensim/corpora/_mmreader.pyx":104 * * """ * for line in input_file: # <<<<<<<<<<<<<< @@ -2867,26 +2908,26 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __pyx_t_1 = __pyx_v_input_file; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; __pyx_t_3 = NULL; } else { - __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_input_file); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_input_file); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 104, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 104, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_3)) { if (likely(PyList_CheckExact(__pyx_t_1))) { if (__pyx_t_2 >= PyList_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 104, __pyx_L1_error) #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, 
__pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 104, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); #endif } else { if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) __PYX_ERR(0, 104, __pyx_L1_error) #else - __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 102, __pyx_L1_error) + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 104, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); #endif } @@ -2896,7 +2937,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 102, __pyx_L1_error) + else __PYX_ERR(0, 104, __pyx_L1_error) } break; } @@ -2905,23 +2946,23 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __Pyx_XDECREF_SET(__pyx_v_line, __pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":103 + /* "gensim/corpora/_mmreader.pyx":105 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< * continue * break */ - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 103, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_line, __pyx_n_s_startswith); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 105, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 103, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_4, 
__pyx_tuple__5, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 105, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 103, __pyx_L1_error) + __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_5); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 105, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; if (__pyx_t_6) { - /* "gensim/corpora/_mmreader.pyx":104 + /* "gensim/corpora/_mmreader.pyx":106 * for line in input_file: * if line.startswith(b'%'): * continue # <<<<<<<<<<<<<< @@ -2930,7 +2971,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ goto __pyx_L3_continue; - /* "gensim/corpora/_mmreader.pyx":103 + /* "gensim/corpora/_mmreader.pyx":105 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< @@ -2939,7 +2980,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ } - /* "gensim/corpora/_mmreader.pyx":105 + /* "gensim/corpora/_mmreader.pyx":107 * if line.startswith(b'%'): * continue * break # <<<<<<<<<<<<<< @@ -2948,7 +2989,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY */ goto __pyx_L4_break; - /* "gensim/corpora/_mmreader.pyx":102 + /* "gensim/corpora/_mmreader.pyx":104 * * """ * for line in input_file: # <<<<<<<<<<<<<< @@ -2960,7 +3001,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY __pyx_L4_break:; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":93 + /* "gensim/corpora/_mmreader.pyx":95 * (self.num_docs, self.num_terms, self.num_nnz)) * * def skip_headers(self, input_file): # <<<<<<<<<<<<<< @@ -2985,17 +3026,17 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_6skip_headers(CY } static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__pyx_CoroutineObject *__pyx_generator, CYTHON_UNUSED PyThreadState *__pyx_tstate, 
PyObject *__pyx_sent_value); /* proto */ -/* "gensim/corpora/_mmreader.pyx":107 +/* "gensim/corpora/_mmreader.pyx":109 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """Iterate through corpus. + * """Iterate through all documents in the corpus. * */ /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_9__iter__(PyObject *__pyx_v_self); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__[] = "Iterate through corpus.\n\n Notes\n ------\n Note that the total number of vectors returned is always equal to the number of rows specified\n in the header, empty documents are inserted and yielded where appropriate, even if they are not explicitly\n stored in the Matrix Market file.\n\n Yields\n ------\n (int, list of (int, number))\n Document id and Document in BoW format\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__[] = "Iterate through all documents in the corpus.\n\n Notes\n ------\n Note that the total number of vectors returned is always equal to the number of rows specified\n in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly\n stored in the Matrix Market file.\n\n Yields\n ------\n (int, list of (int, number))\n Document id and document in sparse bag-of-words format.\n\n "; #if CYTHON_COMPILING_IN_CPYTHON struct wrapperbase __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; #endif @@ -3019,7 +3060,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__iter__(struct if (unlikely(!__pyx_cur_scope)) { __pyx_cur_scope = ((struct __pyx_obj_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ *)Py_None); __Pyx_INCREF(Py_None); - __PYX_ERR(0, 107, __pyx_L1_error) + __PYX_ERR(0, 109, __pyx_L1_error) } else { __Pyx_GOTREF(__pyx_cur_scope); } @@ -3027,7 +3068,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8__iter__(struct __Pyx_INCREF((PyObject 
*)__pyx_cur_scope->__pyx_v_self); __Pyx_GIVEREF((PyObject *)__pyx_cur_scope->__pyx_v_self); { - __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator, (PyObject *) __pyx_cur_scope, __pyx_n_s_iter, __pyx_n_s_MmReader___iter, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 107, __pyx_L1_error) + __pyx_CoroutineObject *gen = __Pyx_Generator_New((__pyx_coroutine_body_t) __pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator, NULL, (PyObject *) __pyx_cur_scope, __pyx_n_s_iter, __pyx_n_s_MmReader___iter, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!gen)) __PYX_ERR(0, 109, __pyx_L1_error) __Pyx_DECREF(__pyx_cur_scope); __Pyx_RefNannyFinishContext(); return (PyObject *) gen; @@ -3083,9 +3124,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py return NULL; } __pyx_L3_first_run:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 107, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 109, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":123 + /* "gensim/corpora/_mmreader.pyx":125 * """ * cdef long long docid, termid, previd * cdef double val = 0 # <<<<<<<<<<<<<< @@ -3094,7 +3135,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_val = 0.0; - /* "gensim/corpora/_mmreader.pyx":125 + /* "gensim/corpora/_mmreader.pyx":127 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -3102,9 +3143,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py * */ /*with:*/ { - __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, 
__pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_file_or_filename); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_t_2 = NULL; @@ -3118,13 +3159,13 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_2) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_cur_scope->__pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_cur_scope->__pyx_v_self->input); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_cur_scope->__pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -3132,27 +3173,27 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { PyObject *__pyx_temp[2] = {__pyx_t_2, __pyx_cur_scope->__pyx_v_self->input}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) 
__PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_2); __pyx_t_2 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_self->input); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_cur_scope->__pyx_v_self->input); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } } __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - __pyx_t_5 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_exit); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_4 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 125, __pyx_L4_error) + __pyx_t_4 = __Pyx_PyObject_LookupSpecial(__pyx_t_1, __pyx_n_s_enter); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 127, __pyx_L4_error) __Pyx_GOTREF(__pyx_t_4); __pyx_t_2 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { @@ -3165,10 +3206,10 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (__pyx_t_2) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L4_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 127, __pyx_L4_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 125, __pyx_L4_error) + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 127, __pyx_L4_error) } 
__Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; @@ -3186,14 +3227,14 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_cur_scope->__pyx_v_lines = __pyx_t_4; __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":126 + /* "gensim/corpora/_mmreader.pyx":128 * * with utils.file_or_filename(self.input) as lines: * self.skip_headers(lines) # <<<<<<<<<<<<<< * * previd = -1 */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_cur_scope->__pyx_v_self), __pyx_n_s_skip_headers); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_cur_scope->__pyx_v_self), __pyx_n_s_skip_headers); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { @@ -3206,13 +3247,13 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_3) { - __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_lines}; - __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_4); } else @@ -3220,19 +3261,19 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = 
{__pyx_t_3, __pyx_cur_scope->__pyx_v_lines}; - __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_4); } else #endif { - __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_2 = PyTuple_New(1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_lines); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_lines); PyTuple_SET_ITEM(__pyx_t_2, 0+1, __pyx_cur_scope->__pyx_v_lines); - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_2, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 126, __pyx_L8_error) + __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_2, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 128, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } @@ -3240,7 +3281,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":128 + /* "gensim/corpora/_mmreader.pyx":130 * self.skip_headers(lines) * * previd = -1 # <<<<<<<<<<<<<< @@ -3249,7 +3290,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_previd = -1LL; - /* "gensim/corpora/_mmreader.pyx":129 + /* "gensim/corpora/_mmreader.pyx":131 * * previd = -1 * for line in lines: # <<<<<<<<<<<<<< @@ -3260,26 +3301,26 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_4 = __pyx_cur_scope->__pyx_v_lines; __Pyx_INCREF(__pyx_t_4); __pyx_t_9 = 0; __pyx_t_10 = NULL; } else { - __pyx_t_9 = 
-1; __pyx_t_4 = PyObject_GetIter(__pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_9 = -1; __pyx_t_4 = PyObject_GetIter(__pyx_cur_scope->__pyx_v_lines); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 131, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_10 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_10 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 131, __pyx_L8_error) } for (;;) { if (likely(!__pyx_t_10)) { if (likely(PyList_CheckExact(__pyx_t_4))) { if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 131, __pyx_L8_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 131, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); #endif } else { if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 131, __pyx_L8_error) #else - __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 129, __pyx_L8_error) + __pyx_t_1 = PySequence_ITEM(__pyx_t_4, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 131, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); #endif } @@ 
-3289,7 +3330,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 129, __pyx_L8_error) + else __PYX_ERR(0, 131, __pyx_L8_error) } break; } @@ -3300,25 +3341,25 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_GIVEREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":131 + /* "gensim/corpora/_mmreader.pyx":133 * for line in lines: * * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< * raise ValueError("unable to parse line: {}".format(line)) * */ - __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_cur_scope->__pyx_v_line); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 131, __pyx_L8_error) + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_cur_scope->__pyx_v_line); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 133, __pyx_L8_error) __pyx_t_12 = ((sscanf(__pyx_t_11, ((char const *)"%lld %lld %lg"), (&__pyx_cur_scope->__pyx_v_docid), (&__pyx_cur_scope->__pyx_v_termid), (&__pyx_cur_scope->__pyx_v_val)) != 3) != 0); - if (__pyx_t_12) { + if (unlikely(__pyx_t_12)) { - /* "gensim/corpora/_mmreader.pyx":132 + /* "gensim/corpora/_mmreader.pyx":134 * * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): * raise ValueError("unable to parse line: {}".format(line)) # <<<<<<<<<<<<<< * * if not self.transposed: */ - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); __pyx_t_3 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_2))) { @@ -3331,13 +3372,13 @@ static PyObject 
*__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } } if (!__pyx_t_3) { - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_cur_scope->__pyx_v_line); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else @@ -3345,37 +3386,32 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_cur_scope->__pyx_v_line}; - __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_GOTREF(__pyx_t_1); } else #endif { - __pyx_t_13 = PyTuple_New(1+1); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_13 = PyTuple_New(1+1); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_13); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_3); __pyx_t_3 = NULL; __Pyx_INCREF(__pyx_cur_scope->__pyx_v_line); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_13, 0+1, __pyx_cur_scope->__pyx_v_line); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_13, NULL); if 
(unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_13, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } } __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = PyTuple_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 132, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 134, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); - __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_2, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __Pyx_Raise(__pyx_t_1, 0, 0, 0); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __PYX_ERR(0, 132, __pyx_L8_error) + __Pyx_Raise(__pyx_t_2, 0, 0, 0); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __PYX_ERR(0, 134, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":131 + /* "gensim/corpora/_mmreader.pyx":133 * for line in lines: * * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< @@ -3384,7 +3420,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":134 + /* "gensim/corpora/_mmreader.pyx":136 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -3394,7 +3430,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_12 = ((!(__pyx_cur_scope->__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":135 + /* "gensim/corpora/_mmreader.pyx":137 * * if not self.transposed: * termid, docid = docid, termid # <<<<<<<<<<<<<< @@ -3406,7 +3442,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py 
__pyx_cur_scope->__pyx_v_termid = __pyx_t_14; __pyx_cur_scope->__pyx_v_docid = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":134 + /* "gensim/corpora/_mmreader.pyx":136 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -3415,7 +3451,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":138 + /* "gensim/corpora/_mmreader.pyx":140 * * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 # <<<<<<<<<<<<<< @@ -3424,7 +3460,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_docid = (__pyx_cur_scope->__pyx_v_docid - 1); - /* "gensim/corpora/_mmreader.pyx":139 + /* "gensim/corpora/_mmreader.pyx":141 * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 * termid -= 1 # <<<<<<<<<<<<<< @@ -3433,7 +3469,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_termid = (__pyx_cur_scope->__pyx_v_termid - 1); - /* "gensim/corpora/_mmreader.pyx":141 + /* "gensim/corpora/_mmreader.pyx":143 * termid -= 1 * * assert previd <= docid, "matrix columns must come in ascending order" # <<<<<<<<<<<<<< @@ -3444,12 +3480,12 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_cur_scope->__pyx_v_previd <= __pyx_cur_scope->__pyx_v_docid) != 0))) { PyErr_SetObject(PyExc_AssertionError, __pyx_kp_s_matrix_columns_must_come_in_asce); - __PYX_ERR(0, 141, __pyx_L8_error) + __PYX_ERR(0, 143, __pyx_L8_error) } } #endif - /* "gensim/corpora/_mmreader.pyx":142 + /* "gensim/corpora/_mmreader.pyx":144 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -3459,7 +3495,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py 
__pyx_t_12 = ((__pyx_cur_scope->__pyx_v_docid != __pyx_cur_scope->__pyx_v_previd) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":144 + /* "gensim/corpora/_mmreader.pyx":146 * if docid != previd: * # change of document: return the document read so far (its id is prevId) * if previd >= 0: # <<<<<<<<<<<<<< @@ -3469,26 +3505,26 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_12 = ((__pyx_cur_scope->__pyx_v_previd >= 0) != 0); if (__pyx_t_12) { - /* "gensim/corpora/_mmreader.pyx":145 + /* "gensim/corpora/_mmreader.pyx":147 * # change of document: return the document read so far (its id is prevId) * if previd >= 0: * yield previd, document # noqa:F821 # <<<<<<<<<<<<<< * * # return implicit (empty) documents between previous id and new id */ - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 145, __pyx_L8_error) } - __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 145, __pyx_L8_error) + __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 147, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_2); - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_1); + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 147, __pyx_L8_error) } + __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_INCREF(__pyx_cur_scope->__pyx_v_document); __Pyx_GIVEREF(__pyx_cur_scope->__pyx_v_document); - PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_cur_scope->__pyx_v_document); - __pyx_t_1 = 0; - __pyx_r = __pyx_t_2; + PyTuple_SET_ITEM(__pyx_t_1, 1, 
__pyx_cur_scope->__pyx_v_document); __pyx_t_2 = 0; + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; __Pyx_XGIVEREF(__pyx_t_4); __pyx_cur_scope->__pyx_t_0 = __pyx_t_4; __Pyx_XGIVEREF(__pyx_t_5); @@ -3525,9 +3561,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGOTREF(__pyx_t_8); __pyx_t_9 = __pyx_cur_scope->__pyx_t_5; __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 145, __pyx_L8_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 147, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":144 + /* "gensim/corpora/_mmreader.pyx":146 * if docid != previd: * # change of document: return the document read so far (its id is prevId) * if previd >= 0: # <<<<<<<<<<<<<< @@ -3536,53 +3572,53 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":149 + /* "gensim/corpora/_mmreader.pyx":151 * # return implicit (empty) documents between previous id and new id * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): # <<<<<<<<<<<<<< * yield previd, [] * */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_13 = __Pyx_PyInt_From_PY_LONG_LONG((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_13 = __Pyx_PyInt_From_PY_LONG_LONG((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 151, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_13); - __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_docid); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_docid); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 151, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); __pyx_t_16 = NULL; __pyx_t_17 = 0; - if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_1))) { - __pyx_t_16 = PyMethod_GET_SELF(__pyx_t_1); + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) { + __pyx_t_16 = PyMethod_GET_SELF(__pyx_t_2); if (likely(__pyx_t_16)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2); __Pyx_INCREF(__pyx_t_16); __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_1, function); + __Pyx_DECREF_SET(__pyx_t_2, function); __pyx_t_17 = 1; } } #if CYTHON_FAST_PYCALL - if (PyFunction_Check(__pyx_t_1)) { + if (PyFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[3] = {__pyx_t_16, __pyx_t_13, __pyx_t_3}; - __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_16); __pyx_t_16 = 0; - __Pyx_GOTREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else #endif #if CYTHON_FAST_PYCCALL - if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { + if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) { PyObject *__pyx_temp[3] = {__pyx_t_16, __pyx_t_13, __pyx_t_3}; - __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L8_error) __Pyx_XDECREF(__pyx_t_16); __pyx_t_16 = 0; - __Pyx_GOTREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; } else #endif { - __pyx_t_18 = 
PyTuple_New(2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_18 = PyTuple_New(2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 151, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_18); if (__pyx_t_16) { __Pyx_GIVEREF(__pyx_t_16); PyTuple_SET_ITEM(__pyx_t_18, 0, __pyx_t_16); __pyx_t_16 = NULL; @@ -3593,78 +3629,78 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyTuple_SET_ITEM(__pyx_t_18, 1+__pyx_t_17, __pyx_t_3); __pyx_t_13 = 0; __pyx_t_3 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_18, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_18, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0; } - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (likely(PyList_CheckExact(__pyx_t_2)) || PyTuple_CheckExact(__pyx_t_2)) { - __pyx_t_1 = __pyx_t_2; __Pyx_INCREF(__pyx_t_1); __pyx_t_19 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (likely(PyList_CheckExact(__pyx_t_1)) || PyTuple_CheckExact(__pyx_t_1)) { + __pyx_t_2 = __pyx_t_1; __Pyx_INCREF(__pyx_t_2); __pyx_t_19 = 0; __pyx_t_20 = NULL; } else { - __pyx_t_19 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_20 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_20)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_19 = -1; __pyx_t_2 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_20 = Py_TYPE(__pyx_t_2)->tp_iternext; if (unlikely(!__pyx_t_20)) __PYX_ERR(0, 151, __pyx_L8_error) } - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; for (;;) { if (likely(!__pyx_t_20)) { - if (likely(PyList_CheckExact(__pyx_t_1))) { - if (__pyx_t_19 >= PyList_GET_SIZE(__pyx_t_1)) 
break; + if (likely(PyList_CheckExact(__pyx_t_2))) { + if (__pyx_t_19 >= PyList_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_2 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_19); __Pyx_INCREF(__pyx_t_2); __pyx_t_19++; if (unlikely(0 < 0)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_1 = PyList_GET_ITEM(__pyx_t_2, __pyx_t_19); __Pyx_INCREF(__pyx_t_1); __pyx_t_19++; if (unlikely(0 < 0)) __PYX_ERR(0, 151, __pyx_L8_error) #else - __pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_19); __pyx_t_19++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_19); __pyx_t_19++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_1); #endif } else { - if (__pyx_t_19 >= PyTuple_GET_SIZE(__pyx_t_1)) break; + if (__pyx_t_19 >= PyTuple_GET_SIZE(__pyx_t_2)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_2 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_19); __Pyx_INCREF(__pyx_t_2); __pyx_t_19++; if (unlikely(0 < 0)) __PYX_ERR(0, 149, __pyx_L8_error) + __pyx_t_1 = PyTuple_GET_ITEM(__pyx_t_2, __pyx_t_19); __Pyx_INCREF(__pyx_t_1); __pyx_t_19++; if (unlikely(0 < 0)) __PYX_ERR(0, 151, __pyx_L8_error) #else - __pyx_t_2 = PySequence_ITEM(__pyx_t_1, __pyx_t_19); __pyx_t_19++; if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = PySequence_ITEM(__pyx_t_2, __pyx_t_19); __pyx_t_19++; if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_1); #endif } } else { - __pyx_t_2 = __pyx_t_20(__pyx_t_1); - if (unlikely(!__pyx_t_2)) { + __pyx_t_1 = __pyx_t_20(__pyx_t_2); + if (unlikely(!__pyx_t_1)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 149, __pyx_L8_error) + else __PYX_ERR(0, 151, __pyx_L8_error) } break; } - 
__Pyx_GOTREF(__pyx_t_2); + __Pyx_GOTREF(__pyx_t_1); } - __pyx_t_15 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_2); if (unlikely((__pyx_t_15 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 149, __pyx_L8_error) - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_15 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_1); if (unlikely((__pyx_t_15 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 151, __pyx_L8_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_cur_scope->__pyx_v_previd = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":150 + /* "gensim/corpora/_mmreader.pyx":152 * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): * yield previd, [] # <<<<<<<<<<<<<< * * # from now on start adding fields to a new document, with a new id */ - __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 150, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_2); - __pyx_t_18 = PyList_New(0); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 150, __pyx_L8_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_18 = PyList_New(0); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 152, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_18); - __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 150, __pyx_L8_error) + __pyx_t_3 = PyTuple_New(2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 152, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); - __Pyx_GIVEREF(__pyx_t_2); - PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_2); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_18); PyTuple_SET_ITEM(__pyx_t_3, 1, __pyx_t_18); - __pyx_t_2 = 0; + __pyx_t_1 = 0; __pyx_t_18 = 0; __pyx_r = __pyx_t_3; __pyx_t_3 = 0; - __Pyx_XGIVEREF(__pyx_t_1); - __pyx_cur_scope->__pyx_t_0 = __pyx_t_1; + __Pyx_XGIVEREF(__pyx_t_2); + __pyx_cur_scope->__pyx_t_0 = __pyx_t_2; 
__Pyx_XGIVEREF(__pyx_t_4); __pyx_cur_scope->__pyx_t_1 = __pyx_t_4; __Pyx_XGIVEREF(__pyx_t_5); @@ -3686,9 +3722,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_generator->resume_label = 2; return __pyx_r; __pyx_L23_resume_from_yield:; - __pyx_t_1 = __pyx_cur_scope->__pyx_t_0; + __pyx_t_2 = __pyx_cur_scope->__pyx_t_0; __pyx_cur_scope->__pyx_t_0 = 0; - __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); __pyx_t_4 = __pyx_cur_scope->__pyx_t_1; __pyx_cur_scope->__pyx_t_1 = 0; __Pyx_XGOTREF(__pyx_t_4); @@ -3708,9 +3744,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; __pyx_t_19 = __pyx_cur_scope->__pyx_t_8; __pyx_t_20 = __pyx_cur_scope->__pyx_t_9; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 150, __pyx_L8_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 152, __pyx_L8_error) - /* "gensim/corpora/_mmreader.pyx":149 + /* "gensim/corpora/_mmreader.pyx":151 * # return implicit (empty) documents between previous id and new id * # too, to keep consistent document numbering and corpus length * for previd in xrange(previd + 1, docid): # <<<<<<<<<<<<<< @@ -3718,9 +3754,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py * */ } - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":153 + /* "gensim/corpora/_mmreader.pyx":155 * * # from now on start adding fields to a new document, with a new id * previd = docid # <<<<<<<<<<<<<< @@ -3729,21 +3765,21 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ __pyx_cur_scope->__pyx_v_previd = __pyx_cur_scope->__pyx_v_docid; - /* "gensim/corpora/_mmreader.pyx":154 + /* "gensim/corpora/_mmreader.pyx":156 * # from now on start adding fields to a new document, with a new id * previd = docid * document = [] # <<<<<<<<<<<<<< * * document.append((termid, val,)) # add another field to 
the current document */ - __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 154, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = PyList_New(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 156, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_2); __Pyx_XGOTREF(__pyx_cur_scope->__pyx_v_document); - __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_document, ((PyObject*)__pyx_t_1)); - __Pyx_GIVEREF(__pyx_t_1); - __pyx_t_1 = 0; + __Pyx_XDECREF_SET(__pyx_cur_scope->__pyx_v_document, ((PyObject*)__pyx_t_2)); + __Pyx_GIVEREF(__pyx_t_2); + __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":142 + /* "gensim/corpora/_mmreader.pyx":144 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -3752,30 +3788,30 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":156 + /* "gensim/corpora/_mmreader.pyx":158 * document = [] * * document.append((termid, val,)) # add another field to the current document # <<<<<<<<<<<<<< * * # handle the last document, as a special case */ - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 156, __pyx_L8_error) } - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_termid); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 156, __pyx_L8_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_val); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 156, __pyx_L8_error) + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 158, __pyx_L8_error) } + __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_termid); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 158, __pyx_L8_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = PyFloat_FromDouble(__pyx_cur_scope->__pyx_v_val); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 158, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_18 = 
PyTuple_New(2); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 156, __pyx_L8_error) + __pyx_t_18 = PyTuple_New(2); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 158, __pyx_L8_error) __Pyx_GOTREF(__pyx_t_18); - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_18, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_18, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_18, 1, __pyx_t_3); - __pyx_t_1 = 0; + __pyx_t_2 = 0; __pyx_t_3 = 0; - __pyx_t_21 = __Pyx_PyList_Append(__pyx_cur_scope->__pyx_v_document, __pyx_t_18); if (unlikely(__pyx_t_21 == ((int)-1))) __PYX_ERR(0, 156, __pyx_L8_error) + __pyx_t_21 = __Pyx_PyList_Append(__pyx_cur_scope->__pyx_v_document, __pyx_t_18); if (unlikely(__pyx_t_21 == ((int)-1))) __PYX_ERR(0, 158, __pyx_L8_error) __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0; - /* "gensim/corpora/_mmreader.pyx":129 + /* "gensim/corpora/_mmreader.pyx":131 * * previd = -1 * for line in lines: # <<<<<<<<<<<<<< @@ -3785,7 +3821,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - /* "gensim/corpora/_mmreader.pyx":125 + /* "gensim/corpora/_mmreader.pyx":127 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< @@ -3800,27 +3836,27 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_L8_error:; __Pyx_XDECREF(__pyx_t_16); __pyx_t_16 = 0; __Pyx_XDECREF(__pyx_t_13); __pyx_t_13 = 0; - __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; __Pyx_XDECREF(__pyx_t_18); __pyx_t_18 = 0; __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; /*except:*/ { __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.__iter__", __pyx_clineno, __pyx_lineno, __pyx_filename); - if (__Pyx_GetException(&__pyx_t_4, &__pyx_t_18, &__pyx_t_3) < 0) __PYX_ERR(0, 125, __pyx_L10_except_error) + if (__Pyx_GetException(&__pyx_t_4, 
&__pyx_t_18, &__pyx_t_3) < 0) __PYX_ERR(0, 127, __pyx_L10_except_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_GOTREF(__pyx_t_18); __Pyx_GOTREF(__pyx_t_3); - __pyx_t_1 = PyTuple_Pack(3, __pyx_t_4, __pyx_t_18, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L10_except_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_22 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); + __pyx_t_2 = PyTuple_Pack(3, __pyx_t_4, __pyx_t_18, __pyx_t_3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 127, __pyx_L10_except_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_22 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_2, NULL); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (unlikely(!__pyx_t_22)) __PYX_ERR(0, 125, __pyx_L10_except_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (unlikely(!__pyx_t_22)) __PYX_ERR(0, 127, __pyx_L10_except_error) __Pyx_GOTREF(__pyx_t_22); __pyx_t_12 = __Pyx_PyObject_IsTrue(__pyx_t_22); __Pyx_DECREF(__pyx_t_22); __pyx_t_22 = 0; - if (__pyx_t_12 < 0) __PYX_ERR(0, 125, __pyx_L10_except_error) + if (__pyx_t_12 < 0) __PYX_ERR(0, 127, __pyx_L10_except_error) __pyx_t_23 = ((!(__pyx_t_12 != 0)) != 0); if (__pyx_t_23) { __Pyx_GIVEREF(__pyx_t_4); @@ -3828,7 +3864,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGIVEREF(__pyx_t_3); __Pyx_ErrRestoreWithState(__pyx_t_4, __pyx_t_18, __pyx_t_3); __pyx_t_4 = 0; __pyx_t_18 = 0; __pyx_t_3 = 0; - __PYX_ERR(0, 125, __pyx_L10_except_error) + __PYX_ERR(0, 127, __pyx_L10_except_error) } __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0; @@ -3854,7 +3890,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (__pyx_t_5) { __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_tuple__6, NULL); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 125, __pyx_L1_error) + if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_8); 
__Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; } @@ -3869,7 +3905,7 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_L27:; } - /* "gensim/corpora/_mmreader.pyx":159 + /* "gensim/corpora/_mmreader.pyx":161 * * # handle the last document, as a special case * if previd >= 0: # <<<<<<<<<<<<<< @@ -3879,17 +3915,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_23 = ((__pyx_cur_scope->__pyx_v_previd >= 0) != 0); if (__pyx_t_23) { - /* "gensim/corpora/_mmreader.pyx":160 + /* "gensim/corpora/_mmreader.pyx":162 * # handle the last document, as a special case * if previd >= 0: * yield previd, document # <<<<<<<<<<<<<< * * # return empty documents between the last explicit document and the number */ - __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 160, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 162, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 160, __pyx_L1_error) } - __pyx_t_18 = PyTuple_New(2); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 160, __pyx_L1_error) + if (unlikely(!__pyx_cur_scope->__pyx_v_document)) { __Pyx_RaiseUnboundLocalError("document"); __PYX_ERR(0, 162, __pyx_L1_error) } + __pyx_t_18 = PyTuple_New(2); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 162, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_18); __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_18, 0, __pyx_t_3); @@ -3906,9 +3942,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_generator->resume_label = 3; return __pyx_r; __pyx_L29_resume_from_yield:; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 160, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 162, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":159 + /* 
"gensim/corpora/_mmreader.pyx":161 * * # handle the last document, as a special case * if previd >= 0: # <<<<<<<<<<<<<< @@ -3917,26 +3953,26 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py */ } - /* "gensim/corpora/_mmreader.pyx":164 + /* "gensim/corpora/_mmreader.pyx":166 * # return empty documents between the last explicit document and the number * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): # <<<<<<<<<<<<<< * yield previd, [] * */ - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_xrange); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_4 = __Pyx_PyInt_From_PY_LONG_LONG((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyInt_From_PY_LONG_LONG((__pyx_cur_scope->__pyx_v_previd + 1)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 164, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = NULL; + __pyx_t_2 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_self->num_docs); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 166, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = NULL; __pyx_t_17 = 0; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { - __pyx_t_2 = PyMethod_GET_SELF(__pyx_t_3); - if (likely(__pyx_t_2)) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_3); + if (likely(__pyx_t_1)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); - __Pyx_INCREF(__pyx_t_2); + __Pyx_INCREF(__pyx_t_1); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_3, function); __pyx_t_17 = 1; @@ -3944,37 +3980,37 @@ static PyObject 
*__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py } #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_t_4, __pyx_t_1}; - __pyx_t_18 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 164, __pyx_L1_error) - __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + PyObject *__pyx_temp[3] = {__pyx_t_1, __pyx_t_4, __pyx_t_2}; + __pyx_t_18 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 166, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_18); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[3] = {__pyx_t_2, __pyx_t_4, __pyx_t_1}; - __pyx_t_18 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 164, __pyx_L1_error) - __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + PyObject *__pyx_temp[3] = {__pyx_t_1, __pyx_t_4, __pyx_t_2}; + __pyx_t_18 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-__pyx_t_17, 2+__pyx_t_17); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 166, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_18); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; } else #endif { - __pyx_t_13 = PyTuple_New(2+__pyx_t_17); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_13 = PyTuple_New(2+__pyx_t_17); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); - if (__pyx_t_2) { - __Pyx_GIVEREF(__pyx_t_2); PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_2); __pyx_t_2 = NULL; + if (__pyx_t_1) { + __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_13, 0, __pyx_t_1); 
__pyx_t_1 = NULL; } __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_13, 0+__pyx_t_17, __pyx_t_4); - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_13, 1+__pyx_t_17, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_2); + PyTuple_SET_ITEM(__pyx_t_13, 1+__pyx_t_17, __pyx_t_2); __pyx_t_4 = 0; - __pyx_t_1 = 0; - __pyx_t_18 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_13, NULL); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_2 = 0; + __pyx_t_18 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_13, NULL); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_18); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } @@ -3983,9 +4019,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __pyx_t_3 = __pyx_t_18; __Pyx_INCREF(__pyx_t_3); __pyx_t_9 = 0; __pyx_t_10 = NULL; } else { - __pyx_t_9 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_18); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_9 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_t_18); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 166, __pyx_L1_error) } __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0; for (;;) { @@ -3993,17 +4029,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_18 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_18); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_18 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_18); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 166, __pyx_L1_error) #else - __pyx_t_18 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); 
__pyx_t_9++; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_18 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_18); #endif } else { if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_18 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_18); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_18 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_18); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 166, __pyx_L1_error) #else - __pyx_t_18 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_18 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_18); #endif } @@ -4013,37 +4049,37 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 164, __pyx_L1_error) + else __PYX_ERR(0, 166, __pyx_L1_error) } break; } __Pyx_GOTREF(__pyx_t_18); } - __pyx_t_15 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_18); if (unlikely((__pyx_t_15 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 164, __pyx_L1_error) + __pyx_t_15 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_t_18); if (unlikely((__pyx_t_15 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 166, __pyx_L1_error) __Pyx_DECREF(__pyx_t_18); __pyx_t_18 = 0; __pyx_cur_scope->__pyx_v_previd = __pyx_t_15; - /* "gensim/corpora/_mmreader.pyx":165 + /* "gensim/corpora/_mmreader.pyx":167 * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): * yield previd, [] # <<<<<<<<<<<<<< * * def docbyoffset(self, offset): */ - __pyx_t_18 = 
__Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 165, __pyx_L1_error) + __pyx_t_18 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_cur_scope->__pyx_v_previd); if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 167, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_18); - __pyx_t_13 = PyList_New(0); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 165, __pyx_L1_error) + __pyx_t_13 = PyList_New(0); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 167, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); - __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 165, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = PyTuple_New(2); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 167, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); __Pyx_GIVEREF(__pyx_t_18); - PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_18); + PyTuple_SET_ITEM(__pyx_t_2, 0, __pyx_t_18); __Pyx_GIVEREF(__pyx_t_13); - PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_13); + PyTuple_SET_ITEM(__pyx_t_2, 1, __pyx_t_13); __pyx_t_18 = 0; __pyx_t_13 = 0; - __pyx_r = __pyx_t_1; - __pyx_t_1 = 0; + __pyx_r = __pyx_t_2; + __pyx_t_2 = 0; __Pyx_XGIVEREF(__pyx_t_3); __pyx_cur_scope->__pyx_t_0 = __pyx_t_3; __pyx_cur_scope->__pyx_t_5 = __pyx_t_9; @@ -4060,9 +4096,9 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_XGOTREF(__pyx_t_3); __pyx_t_9 = __pyx_cur_scope->__pyx_t_5; __pyx_t_10 = __pyx_cur_scope->__pyx_t_6; - if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 165, __pyx_L1_error) + if (unlikely(!__pyx_sent_value)) __PYX_ERR(0, 167, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":164 + /* "gensim/corpora/_mmreader.pyx":166 * # return empty documents between the last explicit document and the number * # of documents as specified in the header * for previd in xrange(previd + 1, self.num_docs): # <<<<<<<<<<<<<< @@ -4073,11 +4109,11 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; CYTHON_MAYBE_UNUSED_VAR(__pyx_cur_scope); - /* 
"gensim/corpora/_mmreader.pyx":107 + /* "gensim/corpora/_mmreader.pyx":109 * break * * def __iter__(self): # <<<<<<<<<<<<<< - * """Iterate through corpus. + * """Iterate through all documents in the corpus. * */ @@ -4102,17 +4138,17 @@ static PyObject *__pyx_gb_6gensim_7corpora_9_mmreader_8MmReader_10generator(__py return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":167 +/* "gensim/corpora/_mmreader.pyx":169 * yield previd, [] * * def docbyoffset(self, offset): # <<<<<<<<<<<<<< - * """Get document at file offset `offset` (in bytes). + * """Get the document at file offset `offset` (in bytes). * */ /* Python wrapper */ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_12docbyoffset(PyObject *__pyx_v_self, PyObject *__pyx_v_offset); /*proto*/ -static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset[] = "MmReader.docbyoffset(self, offset)\nGet document at file offset `offset` (in bytes).\n\n Parameters\n ----------\n offset : int\n Offset, in bytes, of desired document.\n\n Returns\n ------\n list of (int, str)\n Document in BoW format.\n\n "; +static char __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset[] = "MmReader.docbyoffset(self, offset)\nGet the document at file offset `offset` (in bytes).\n\n Parameters\n ----------\n offset : int\n File offset, in bytes, of the desired document.\n\n Returns\n ------\n list of (int, str)\n Document in sparse bag-of-words format.\n\n "; static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_8MmReader_12docbyoffset(PyObject *__pyx_v_self, PyObject *__pyx_v_offset) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations @@ -4150,20 +4186,20 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st int __pyx_t_13; __Pyx_RefNannySetupContext("docbyoffset", 0); - /* "gensim/corpora/_mmreader.pyx":186 + /* "gensim/corpora/_mmreader.pyx":188 * cdef double val * * if offset == -1: # <<<<<<<<<<<<<< * return [] * if isinstance(self.input, string_types): */ - __pyx_t_1 = 
__Pyx_PyInt_EqObjC(__pyx_v_offset, __pyx_int_neg_1, -1L, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 186, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_EqObjC(__pyx_v_offset, __pyx_int_neg_1, -1L, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 188, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 186, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 188, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; if (__pyx_t_2) { - /* "gensim/corpora/_mmreader.pyx":187 + /* "gensim/corpora/_mmreader.pyx":189 * * if offset == -1: * return [] # <<<<<<<<<<<<<< @@ -4171,13 +4207,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * fin, close_fin = utils.smart_open(self.input), True */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error) + __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":186 + /* "gensim/corpora/_mmreader.pyx":188 * cdef double val * * if offset == -1: # <<<<<<<<<<<<<< @@ -4186,7 +4222,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":188 + /* "gensim/corpora/_mmreader.pyx":190 * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< @@ -4195,24 +4231,24 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_t_1 = __pyx_v_self->input; __Pyx_INCREF(__pyx_t_1); - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_string_types); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 188, __pyx_L1_error) + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_string_types); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 190, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_2 = 
PyObject_IsInstance(__pyx_t_1, __pyx_t_3); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 188, __pyx_L1_error) + __pyx_t_2 = PyObject_IsInstance(__pyx_t_1, __pyx_t_3); if (unlikely(__pyx_t_2 == ((int)-1))) __PYX_ERR(0, 190, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; __pyx_t_4 = (__pyx_t_2 != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":189 + /* "gensim/corpora/_mmreader.pyx":191 * return [] * if isinstance(self.input, string_types): * fin, close_fin = utils.smart_open(self.input), True # <<<<<<<<<<<<<< * else: * fin, close_fin = self.input, False */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_smart_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_smart_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = NULL; @@ -4226,13 +4262,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_1) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_self->input}; - __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_3); } else @@ -4240,19 +4276,19 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_1, __pyx_v_self->input}; - __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_GOTREF(__pyx_t_3); } else #endif { - __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_1); __pyx_t_1 = NULL; __Pyx_INCREF(__pyx_v_self->input); __Pyx_GIVEREF(__pyx_v_self->input); PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_v_self->input); - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; } @@ -4263,7 +4299,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_3 = 0; __pyx_v_close_fin = __pyx_t_4; - /* "gensim/corpora/_mmreader.pyx":188 + /* "gensim/corpora/_mmreader.pyx":190 * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< @@ -4273,7 +4309,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st goto __pyx_L4; } - /* "gensim/corpora/_mmreader.pyx":191 + /* "gensim/corpora/_mmreader.pyx":193 * fin, close_fin = 
utils.smart_open(self.input), True * else: * fin, close_fin = self.input, False # <<<<<<<<<<<<<< @@ -4290,14 +4326,14 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } __pyx_L4:; - /* "gensim/corpora/_mmreader.pyx":193 + /* "gensim/corpora/_mmreader.pyx":195 * fin, close_fin = self.input, False * * fin.seek(offset) # works for gzip/bz2 input, too # <<<<<<<<<<<<<< * previd, document = -1, [] * for line in fin: */ - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_seek); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_seek); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __pyx_t_6 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_5))) { @@ -4310,13 +4346,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_6) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_offset}; - __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_3); } else @@ -4324,19 +4360,19 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_offset}; - __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if 
(unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_3); } else #endif { - __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_1 = PyTuple_New(1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_6); __pyx_t_6 = NULL; __Pyx_INCREF(__pyx_v_offset); __Pyx_GIVEREF(__pyx_v_offset); PyTuple_SET_ITEM(__pyx_t_1, 0+1, __pyx_v_offset); - __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_1, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; } @@ -4344,7 +4380,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":194 + /* "gensim/corpora/_mmreader.pyx":196 * * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] # <<<<<<<<<<<<<< @@ -4352,13 +4388,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): */ __pyx_t_7 = -1LL; - __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 194, __pyx_L1_error) + __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 196, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __pyx_v_previd = __pyx_t_7; __pyx_v_document = ((PyObject*)__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":195 + /* "gensim/corpora/_mmreader.pyx":197 * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] * for 
line in fin: # <<<<<<<<<<<<<< @@ -4369,26 +4405,26 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_3 = __pyx_v_fin; __Pyx_INCREF(__pyx_t_3); __pyx_t_8 = 0; __pyx_t_9 = NULL; } else { - __pyx_t_8 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_8 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_9 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_9 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_9)) __PYX_ERR(0, 197, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_9)) { if (likely(PyList_CheckExact(__pyx_t_3))) { if (__pyx_t_8 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } else { if (__pyx_t_8 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_8); __Pyx_INCREF(__pyx_t_5); __pyx_t_8++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if 
(unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_8); __pyx_t_8++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } @@ -4398,7 +4434,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 195, __pyx_L1_error) + else __PYX_ERR(0, 197, __pyx_L1_error) } break; } @@ -4407,25 +4443,25 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_XDECREF_SET(__pyx_v_line, __pyx_t_5); __pyx_t_5 = 0; - /* "gensim/corpora/_mmreader.pyx":196 + /* "gensim/corpora/_mmreader.pyx":198 * previd, document = -1, [] * for line in fin: * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< * raise ValueError("unable to parse line: {}".format(line)) * */ - __pyx_t_10 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_10) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_10) && PyErr_Occurred())) __PYX_ERR(0, 198, __pyx_L1_error) __pyx_t_4 = ((sscanf(__pyx_t_10, ((char const *)"%lld %lld %lg"), (&__pyx_v_docid), (&__pyx_v_termid), (&__pyx_v_val)) != 3) != 0); - if (__pyx_t_4) { + if (unlikely(__pyx_t_4)) { - /* "gensim/corpora/_mmreader.pyx":197 + /* "gensim/corpora/_mmreader.pyx":199 * for line in fin: * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): * raise ValueError("unable to parse line: {}".format(line)) # <<<<<<<<<<<<<< * * if not self.transposed: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) 
__PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_6 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { @@ -4438,13 +4474,13 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st } } if (!__pyx_t_6) { - __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_line); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_line); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); } else { #if CYTHON_FAST_PYCALL if (PyFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_line}; - __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_5); } else @@ -4452,37 +4488,32 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st #if CYTHON_FAST_PYCCALL if (__Pyx_PyFastCFunction_Check(__pyx_t_1)) { PyObject *__pyx_temp[2] = {__pyx_t_6, __pyx_v_line}; - __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyCFunction_FastCall(__pyx_t_1, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_6); __pyx_t_6 = 0; __Pyx_GOTREF(__pyx_t_5); } else #endif { - __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_11 = PyTuple_New(1+1); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_6); PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6); __pyx_t_6 = NULL; __Pyx_INCREF(__pyx_v_line); __Pyx_GIVEREF(__pyx_v_line); PyTuple_SET_ITEM(__pyx_t_11, 0+1, __pyx_v_line); - 
__pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_11, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_t_11, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; } } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __Pyx_GIVEREF(__pyx_t_5); - PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_5); - __pyx_t_5 = 0; - __pyx_t_5 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_5); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_Raise(__pyx_t_5, 0, 0, 0); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __PYX_ERR(0, 197, __pyx_L1_error) + __Pyx_Raise(__pyx_t_1, 0, 0, 0); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __PYX_ERR(0, 199, __pyx_L1_error) - /* "gensim/corpora/_mmreader.pyx":196 + /* "gensim/corpora/_mmreader.pyx":198 * previd, document = -1, [] * for line in fin: * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): # <<<<<<<<<<<<<< @@ -4491,7 +4522,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":199 + /* "gensim/corpora/_mmreader.pyx":201 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -4501,7 +4532,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((!(__pyx_v_self->transposed != 0)) != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":200 + /* "gensim/corpora/_mmreader.pyx":202 * * if not self.transposed: * termid, docid = docid, termid # <<<<<<<<<<<<<< @@ -4513,7 +4544,7 @@ static 
PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_v_termid = __pyx_t_7; __pyx_v_docid = __pyx_t_12; - /* "gensim/corpora/_mmreader.pyx":199 + /* "gensim/corpora/_mmreader.pyx":201 * raise ValueError("unable to parse line: {}".format(line)) * * if not self.transposed: # <<<<<<<<<<<<<< @@ -4522,7 +4553,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":203 + /* "gensim/corpora/_mmreader.pyx":205 * * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 # <<<<<<<<<<<<<< @@ -4531,7 +4562,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_docid = (__pyx_v_docid - 1); - /* "gensim/corpora/_mmreader.pyx":204 + /* "gensim/corpora/_mmreader.pyx":206 * # -1 because matrix market indexes are 1-based => convert to 0-based * docid -= 1 * termid -= 1 # <<<<<<<<<<<<<< @@ -4540,7 +4571,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_termid = (__pyx_v_termid - 1); - /* "gensim/corpora/_mmreader.pyx":206 + /* "gensim/corpora/_mmreader.pyx":208 * termid -= 1 * * assert previd <= docid, "matrix columns must come in ascending order" # <<<<<<<<<<<<<< @@ -4551,12 +4582,12 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_previd <= __pyx_v_docid) != 0))) { PyErr_SetObject(PyExc_AssertionError, __pyx_kp_s_matrix_columns_must_come_in_asce); - __PYX_ERR(0, 206, __pyx_L1_error) + __PYX_ERR(0, 208, __pyx_L1_error) } } #endif - /* "gensim/corpora/_mmreader.pyx":207 + /* "gensim/corpora/_mmreader.pyx":209 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -4566,7 +4597,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((__pyx_v_docid != __pyx_v_previd) != 0); if 
(__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":208 + /* "gensim/corpora/_mmreader.pyx":210 * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: * if previd >= 0: # <<<<<<<<<<<<<< @@ -4576,7 +4607,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = ((__pyx_v_previd >= 0) != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":209 + /* "gensim/corpora/_mmreader.pyx":211 * if docid != previd: * if previd >= 0: * break # <<<<<<<<<<<<<< @@ -4585,7 +4616,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ goto __pyx_L6_break; - /* "gensim/corpora/_mmreader.pyx":208 + /* "gensim/corpora/_mmreader.pyx":210 * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: * if previd >= 0: # <<<<<<<<<<<<<< @@ -4594,7 +4625,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":210 + /* "gensim/corpora/_mmreader.pyx":212 * if previd >= 0: * break * previd = docid # <<<<<<<<<<<<<< @@ -4603,7 +4634,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_v_previd = __pyx_v_docid; - /* "gensim/corpora/_mmreader.pyx":207 + /* "gensim/corpora/_mmreader.pyx":209 * * assert previd <= docid, "matrix columns must come in ascending order" * if docid != previd: # <<<<<<<<<<<<<< @@ -4612,29 +4643,29 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":212 + /* "gensim/corpora/_mmreader.pyx":214 * previd = docid * * document.append((termid, val,)) # add another field to the current document # <<<<<<<<<<<<<< * * if close_fin: */ - __pyx_t_5 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_termid); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 212, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_5); - __pyx_t_1 = PyFloat_FromDouble(__pyx_v_val); if (unlikely(!__pyx_t_1)) 
__PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_termid); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 214, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_5 = PyFloat_FromDouble(__pyx_v_val); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 214, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); - __Pyx_GIVEREF(__pyx_t_5); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_5); __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_1); - __pyx_t_5 = 0; + PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_5); __pyx_t_1 = 0; - __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_11); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 212, __pyx_L1_error) + __pyx_t_5 = 0; + __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_11); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 214, __pyx_L1_error) __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - /* "gensim/corpora/_mmreader.pyx":195 + /* "gensim/corpora/_mmreader.pyx":197 * fin.seek(offset) # works for gzip/bz2 input, too * previd, document = -1, [] * for line in fin: # <<<<<<<<<<<<<< @@ -4645,7 +4676,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_L6_break:; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":214 + /* "gensim/corpora/_mmreader.pyx":216 * document.append((termid, val,)) # add another field to the current document * * if close_fin: # <<<<<<<<<<<<<< @@ -4655,35 +4686,35 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_t_4 = (__pyx_v_close_fin != 0); if (__pyx_t_4) { - /* "gensim/corpora/_mmreader.pyx":215 + /* "gensim/corpora/_mmreader.pyx":217 * * if close_fin: * fin.close() # <<<<<<<<<<<<<< * 
return document */ - __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 215, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 217, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_11); - __pyx_t_1 = NULL; + __pyx_t_5 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_11))) { - __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_11); - if (likely(__pyx_t_1)) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_11); + if (likely(__pyx_t_5)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_11); - __Pyx_INCREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_5); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_11, function); } } - if (__pyx_t_1) { - __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_11, __pyx_t_1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (__pyx_t_5) { + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_11, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 217, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else { - __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 215, __pyx_L1_error) + __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_11); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 217, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "gensim/corpora/_mmreader.pyx":214 + /* "gensim/corpora/_mmreader.pyx":216 * document.append((termid, val,)) # add another field to the current document * * if close_fin: # <<<<<<<<<<<<<< @@ -4692,7 +4723,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ } - /* "gensim/corpora/_mmreader.pyx":216 + /* "gensim/corpora/_mmreader.pyx":218 * if close_fin: * fin.close() * return document # <<<<<<<<<<<<<< @@ -4702,11 +4733,11 @@ static PyObject 
*__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __pyx_r = __pyx_v_document; goto __pyx_L0; - /* "gensim/corpora/_mmreader.pyx":167 + /* "gensim/corpora/_mmreader.pyx":169 * yield previd, [] * * def docbyoffset(self, offset): # <<<<<<<<<<<<<< - * """Get document at file offset `offset` (in bytes). + * """Get the document at file offset `offset` (in bytes). * */ @@ -4728,7 +4759,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":41 +/* "gensim/corpora/_mmreader.pyx":43 * * """ * cdef public input # <<<<<<<<<<<<<< @@ -4823,7 +4854,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_5input_4__del__(struct return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":42 +/* "gensim/corpora/_mmreader.pyx":44 * """ * cdef public input * cdef public bint transposed # <<<<<<<<<<<<<< @@ -4850,7 +4881,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed___g PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->transposed); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 42, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyBool_FromLong(__pyx_v_self->transposed); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 44, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -4885,7 +4916,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed_2__set__( __Pyx_RefNannyDeclarations int __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 42, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_IsTrue(__pyx_v_value); if (unlikely((__pyx_t_1 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 44, __pyx_L1_error) __pyx_v_self->transposed = __pyx_t_1; /* function exit code */ @@ -4899,7 +4930,7 @@ static int 
__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_10transposed_2__set__( return __pyx_r; } -/* "gensim/corpora/_mmreader.pyx":43 +/* "gensim/corpora/_mmreader.pyx":45 * cdef public input * cdef public bint transposed * cdef public long long num_docs, num_terms, num_nnz # <<<<<<<<<<<<<< @@ -4926,7 +4957,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8num_docs___get_ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_docs); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 45, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -4961,7 +4992,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_8num_docs_2__set__(str __Pyx_RefNannyDeclarations PY_LONG_LONG __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error) __pyx_v_self->num_docs = __pyx_t_1; /* function exit code */ @@ -4994,7 +5025,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_9num_terms___get PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_terms); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 45, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -5029,7 +5060,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_9num_terms_2__set__(st 
__Pyx_RefNannyDeclarations PY_LONG_LONG __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error) __pyx_v_self->num_terms = __pyx_t_1; /* function exit code */ @@ -5062,7 +5093,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_7num_nnz___get__ PyObject *__pyx_t_1 = NULL; __Pyx_RefNannySetupContext("__get__", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_self->num_nnz); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 45, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -5097,7 +5128,7 @@ static int __pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_7num_nnz_2__set__(stru __Pyx_RefNannyDeclarations PY_LONG_LONG __pyx_t_1; __Pyx_RefNannySetupContext("__set__", 0); - __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 43, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_As_PY_LONG_LONG(__pyx_v_value); if (unlikely((__pyx_t_1 == (PY_LONG_LONG)-1) && PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error) __pyx_v_self->num_nnz = __pyx_t_1; /* function exit code */ @@ -5460,17 +5491,17 @@ static PyObject *__pyx_pw_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader(P kw_args = PyDict_Size(__pyx_kwds); switch (pos_args) { case 0: - if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_pyx_type)) != 0)) kw_args--; + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_pyx_type)) != 0)) kw_args--; else goto __pyx_L5_argtuple_error; CYTHON_FALLTHROUGH; case 1: - if 
(likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_pyx_checksum)) != 0)) kw_args--; + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_pyx_checksum)) != 0)) kw_args--; else { __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_MmReader", 1, 3, 3, 1); __PYX_ERR(1, 1, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: - if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_pyx_state)) != 0)) kw_args--; + if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_pyx_state)) != 0)) kw_args--; else { __Pyx_RaiseArgtupleInvalid("__pyx_unpickle_MmReader", 1, 3, 3, 2); __PYX_ERR(1, 1, __pyx_L3_error) } @@ -6115,7 +6146,7 @@ static PyTypeObject __pyx_type_6gensim_7corpora_9_mmreader_MmReader = { 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT|Py_TPFLAGS_HAVE_VERSION_TAG|Py_TPFLAGS_CHECKTYPES|Py_TPFLAGS_HAVE_NEWBUFFER|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_GC, /*tp_flags*/ - "MmReader(input, transposed=True)\nMatrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`.\n\n Wrap a term-document matrix on disk (in matrix-market format), and present it\n as an object which supports iteration over the rows (~documents).\n\n Attributes\n ----------\n num_docs : int\n Number of documents in market matrix file.\n num_terms : int\n Number of terms.\n num_nnz : int\n Number of non-zero terms.\n\n Notes\n ----------\n Note that the file is read into memory one document at a time, not the whole\n matrix at once (unlike scipy.io.mmread). 
This allows us to process corpora\n which are larger than the available RAM.\n\n ", /*tp_doc*/ + "MmReader(input, transposed=True)\nMatrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.\n\n Wrap a term-document matrix on disk (in matrix-market format), and present it\n as an object which supports iteration over the rows (~documents).\n\n Attributes\n ----------\n num_docs : int\n Number of documents in market matrix file.\n num_terms : int\n Number of terms.\n num_nnz : int\n Number of non-zero terms.\n\n Notes\n -----\n Note that the file is read into memory one document at a time, not the whole matrix at once\n (unlike e.g. `scipy.io.mmread` and other implementations).\n This allows us to process corpora which are larger than the available RAM.\n\n ", /*tp_doc*/ __pyx_tp_traverse_6gensim_7corpora_9_mmreader_MmReader, /*tp_traverse*/ __pyx_tp_clear_6gensim_7corpora_9_mmreader_MmReader, /*tp_clear*/ 0, /*tp_richcompare*/ @@ -6595,9 +6626,9 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {0, 0, 0, 0, 0, 0, 0} }; static int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(0, 64, __pyx_L1_error) - __pyx_builtin_StopIteration = __Pyx_GetBuiltinName(__pyx_n_s_StopIteration); if (!__pyx_builtin_StopIteration) __PYX_ERR(0, 68, __pyx_L1_error) - __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 72, __pyx_L1_error) + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) __PYX_ERR(0, 66, __pyx_L1_error) + __pyx_builtin_StopIteration = __Pyx_GetBuiltinName(__pyx_n_s_StopIteration); if (!__pyx_builtin_StopIteration) __PYX_ERR(0, 70, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 74, __pyx_L1_error) return 0; __pyx_L1_error:; 
return -1; @@ -6607,58 +6638,58 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); - /* "gensim/corpora/_mmreader.pyx":63 + /* "gensim/corpora/_mmreader.pyx":65 * try: * header = utils.to_unicode(next(lines)).strip() * if not header.lower().startswith('%%matrixmarket matrix coordinate real general'): # <<<<<<<<<<<<<< * raise ValueError( * "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % */ - __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_matrixmarket_matrix_coordinate); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 63, __pyx_L1_error) + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_matrixmarket_matrix_coordinate); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 65, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple_); __Pyx_GIVEREF(__pyx_tuple_); - /* "gensim/corpora/_mmreader.pyx":74 + /* "gensim/corpora/_mmreader.pyx":76 * for lineno, line in enumerate(lines): * line = utils.to_unicode(line) * if not line.startswith('%'): # <<<<<<<<<<<<<< * self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split()) * if not self.transposed: */ - __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s__2); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 74, __pyx_L1_error) + __pyx_tuple__3 = PyTuple_Pack(1, __pyx_kp_s__2); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 76, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__3); __Pyx_GIVEREF(__pyx_tuple__3); - /* "gensim/corpora/_mmreader.pyx":60 + /* "gensim/corpora/_mmreader.pyx":62 * logger.info("initializing cython corpus reader from %s", input) * self.input, self.transposed = input, transposed * with utils.open_file(self.input) as lines: # <<<<<<<<<<<<<< * try: * header = utils.to_unicode(next(lines)).strip() */ - __pyx_tuple__4 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 60, __pyx_L1_error) + __pyx_tuple__4 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 
62, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - /* "gensim/corpora/_mmreader.pyx":103 + /* "gensim/corpora/_mmreader.pyx":105 * """ * for line in input_file: * if line.startswith(b'%'): # <<<<<<<<<<<<<< * continue * break */ - __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_b__2); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 103, __pyx_L1_error) + __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_b__2); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 105, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__5); __Pyx_GIVEREF(__pyx_tuple__5); - /* "gensim/corpora/_mmreader.pyx":125 + /* "gensim/corpora/_mmreader.pyx":127 * cdef double val = 0 * * with utils.file_or_filename(self.input) as lines: # <<<<<<<<<<<<<< * self.skip_headers(lines) * */ - __pyx_tuple__6 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 125, __pyx_L1_error) + __pyx_tuple__6 = PyTuple_Pack(3, Py_None, Py_None, Py_None); if (unlikely(!__pyx_tuple__6)) __PYX_ERR(0, 127, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__6); __Pyx_GIVEREF(__pyx_tuple__6); @@ -6689,12 +6720,160 @@ static int __Pyx_InitGlobals(void) { return -1; } +static int __Pyx_modinit_global_init_code(void); /*proto*/ +static int __Pyx_modinit_variable_export_code(void); /*proto*/ +static int __Pyx_modinit_function_export_code(void); /*proto*/ +static int __Pyx_modinit_type_init_code(void); /*proto*/ +static int __Pyx_modinit_type_import_code(void); /*proto*/ +static int __Pyx_modinit_variable_import_code(void); /*proto*/ +static int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code 
---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) + __pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_print = 0; + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_dictoffset && __pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_getattro = __Pyx_PyObject_GenericGetAttr; + } + #if CYTHON_COMPILING_IN_CPYTHON + { + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__init__"); if (unlikely(!wrapper)) __PYX_ERR(0, 21, __pyx_L1_error) + if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__ = *((PyWrapperDescrObject *)wrapper)->d_base; + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__; + ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__; + } + } + #endif + #if CYTHON_COMPILING_IN_CPYTHON + { + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__len__"); if (unlikely(!wrapper)) __PYX_ERR(0, 21, __pyx_L1_error) + if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__ = *((PyWrapperDescrObject *)wrapper)->d_base; + 
__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__; + ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__; + } + } + #endif + #if CYTHON_COMPILING_IN_CPYTHON + { + PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__iter__"); if (unlikely(!wrapper)) __PYX_ERR(0, 21, __pyx_L1_error) + if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__ = *((PyWrapperDescrObject *)wrapper)->d_base; + __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; + ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; + } + } + #endif + if (PyObject_SetAttrString(__pyx_m, "MmReader", (PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) + if (__Pyx_setup_reduce((PyObject*)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 21, __pyx_L1_error) + __pyx_ptype_6gensim_7corpora_9_mmreader_MmReader = &__pyx_type_6gensim_7corpora_9_mmreader_MmReader; + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__) < 0) __PYX_ERR(0, 47, __pyx_L1_error) + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_print = 0; + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_dictoffset && __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; + } + __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ = 
&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__; + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr) < 0) __PYX_ERR(0, 77, __pyx_L1_error) + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_print = 0; + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_dictoffset && __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; + } + __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr; + if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__) < 0) __PYX_ERR(0, 109, __pyx_L1_error) + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_print = 0; + if ((CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP) && likely(!__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_dictoffset && __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_getattro == PyObject_GenericGetAttr)) { + __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; + } + __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__; + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + 
__Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#if PY_MAJOR_VERSION < 3 +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC void +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#else +#ifdef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#endif +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (!(defined(__cplusplus)) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))) + #define CYTHON_SMALL_CODE __attribute__((optimize("Os"))) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + + #if PY_MAJOR_VERSION < 3 -PyMODINIT_FUNC init_mmreader(void); /*proto*/ -PyMODINIT_FUNC init_mmreader(void) +__Pyx_PyMODINIT_FUNC init_mmreader(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC init_mmreader(void) #else -PyMODINIT_FUNC PyInit__mmreader(void); /*proto*/ -PyMODINIT_FUNC PyInit__mmreader(void) +__Pyx_PyMODINIT_FUNC PyInit__mmreader(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit__mmreader(void) #if CYTHON_PEP489_MULTI_PHASE_INIT { return PyModuleDef_Init(&__pyx_moduledef); @@ -6741,22 +6920,22 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) PyObject *__pyx_t_1 = NULL; PyObject *__pyx_t_2 = NULL; PyObject *__pyx_t_3 = NULL; - PyObject *__pyx_t_4 = NULL; - PyObject *__pyx_t_5 = NULL; __Pyx_RefNannyDeclarations #if CYTHON_PEP489_MULTI_PHASE_INIT if (__pyx_m && __pyx_m == __pyx_pyinit_module) return 0; + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); #endif #if CYTHON_REFNANNY - __Pyx_RefNanny = 
__Pyx_RefNannyImportAPI("refnanny"); - if (!__Pyx_RefNanny) { - PyErr_Clear(); - __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); - if (!__Pyx_RefNanny) - Py_FatalError("failed to import 'refnanny' module"); - } - #endif - __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit__mmreader(void)", 0); +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit__mmreader(void)", 0); if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) @@ -6826,219 +7005,134 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) /*--- Constants init code ---*/ if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) - /*--- Global init code ---*/ - /*--- Variable export code ---*/ - /*--- Function export code ---*/ - /*--- Type init code ---*/ - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_type_6gensim_7corpora_9_mmreader_MmReader.tp_print = 0; - #if CYTHON_COMPILING_IN_CPYTHON - { - PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__init__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) - if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__ = *((PyWrapperDescrObject *)wrapper)->d_base; - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__.doc = 
__pyx_doc_6gensim_7corpora_9_mmreader_8MmReader___init__; - ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader___init__; - } - } - #endif - #if CYTHON_COMPILING_IN_CPYTHON - { - PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__len__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) - if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__ = *((PyWrapperDescrObject *)wrapper)->d_base; - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_2__len__; - ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_2__len__; - } - } - #endif - #if CYTHON_COMPILING_IN_CPYTHON - { - PyObject *wrapper = PyObject_GetAttrString((PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader, "__iter__"); if (unlikely(!wrapper)) __PYX_ERR(0, 19, __pyx_L1_error) - if (Py_TYPE(wrapper) == &PyWrapperDescr_Type) { - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__ = *((PyWrapperDescrObject *)wrapper)->d_base; - __pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__.doc = __pyx_doc_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; - ((PyWrapperDescrObject *)wrapper)->d_base = &__pyx_wrapperbase_6gensim_7corpora_9_mmreader_8MmReader_8__iter__; - } - } - #endif - if (PyObject_SetAttrString(__pyx_m, "MmReader", (PyObject *)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) - if (__Pyx_setup_reduce((PyObject*)&__pyx_type_6gensim_7corpora_9_mmreader_MmReader) < 0) __PYX_ERR(0, 19, __pyx_L1_error) - __pyx_ptype_6gensim_7corpora_9_mmreader_MmReader = &__pyx_type_6gensim_7corpora_9_mmreader_MmReader; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__) < 0) __PYX_ERR(0, 45, __pyx_L1_error) - 
__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__.tp_print = 0; - __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__ = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct____init__; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr) < 0) __PYX_ERR(0, 75, __pyx_L1_error) - __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr.tp_print = 0; - __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_1_genexpr; - if (PyType_Ready(&__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__) < 0) __PYX_ERR(0, 107, __pyx_L1_error) - __pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__.tp_print = 0; - __pyx_ptype_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__ = &__pyx_type_6gensim_7corpora_9_mmreader___pyx_scope_struct_2___iter__; - /*--- Type import code ---*/ - /*--- Variable import code ---*/ - /*--- Function import code ---*/ + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + if (unlikely(__Pyx_modinit_type_init_code() != 0)) goto __pyx_L1_error; + (void)__Pyx_modinit_type_import_code(); + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); /*--- Execution code ---*/ #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) #endif - /* "gensim/corpora/_mmreader.pyx":6 + /* "gensim/corpora/_mmreader.pyx":8 * from __future__ import with_statement * * from gensim import utils # <<<<<<<<<<<<<< * * from six import string_types */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); 
__Pyx_INCREF(__pyx_n_s_utils); __Pyx_GIVEREF(__pyx_n_s_utils); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_utils); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_utils, __pyx_t_1) < 0) __PYX_ERR(0, 6, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_utils, __pyx_t_1) < 0) __PYX_ERR(0, 8, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":8 + /* "gensim/corpora/_mmreader.pyx":10 * from gensim import utils * * from six import string_types # <<<<<<<<<<<<<< * from six.moves import xrange * import logging */ - __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_2 = PyList_New(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_INCREF(__pyx_n_s_string_types); __Pyx_GIVEREF(__pyx_n_s_string_types); PyList_SET_ITEM(__pyx_t_2, 0, __pyx_n_s_string_types); - __pyx_t_1 = __Pyx_Import(__pyx_n_s_six, __pyx_t_2, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_1 = __Pyx_Import(__pyx_n_s_six, __pyx_t_2, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_string_types); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error) + __pyx_t_2 = __Pyx_ImportFrom(__pyx_t_1, __pyx_n_s_string_types); if (unlikely(!__pyx_t_2)) 
__PYX_ERR(0, 10, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_string_types, __pyx_t_2) < 0) __PYX_ERR(0, 8, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_string_types, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "gensim/corpora/_mmreader.pyx":9 + /* "gensim/corpora/_mmreader.pyx":11 * * from six import string_types * from six.moves import xrange # <<<<<<<<<<<<<< * import logging * */ - __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_xrange); __Pyx_GIVEREF(__pyx_n_s_xrange); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_xrange); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_six_moves, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 9, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_six_moves, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_xrange); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_xrange, __pyx_t_1) < 0) __PYX_ERR(0, 9, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_xrange, __pyx_t_1) < 0) __PYX_ERR(0, 11, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":10 + /* "gensim/corpora/_mmreader.pyx":12 * from six import string_types * from six.moves import xrange * import logging # <<<<<<<<<<<<<< * * cimport cython */ - __pyx_t_2 = __Pyx_Import(__pyx_n_s_logging, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) + 
__pyx_t_2 = __Pyx_Import(__pyx_n_s_logging, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_logging, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_logging, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "gensim/corpora/_mmreader.pyx":16 + /* "gensim/corpora/_mmreader.pyx":18 * * * logger = logging.getLogger(__name__) # <<<<<<<<<<<<<< * * */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_logging); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_logging); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_getLogger); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_getLogger); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 16, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_name); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_name); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_4 = NULL; - if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_3))) { - __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_3); - if (likely(__pyx_t_4)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_3); - __Pyx_INCREF(__pyx_t_4); - __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_3, function); - } - } - if (!__pyx_t_4) { - __pyx_t_2 = __Pyx_PyObject_CallOneArg(__pyx_t_3, __pyx_t_1); if 
(unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __Pyx_GOTREF(__pyx_t_2); - } else { - #if CYTHON_FAST_PYCALL - if (PyFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_1}; - __pyx_t_2 = __Pyx_PyFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - } else - #endif - #if CYTHON_FAST_PYCCALL - if (__Pyx_PyFastCFunction_Check(__pyx_t_3)) { - PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_1}; - __pyx_t_2 = __Pyx_PyCFunction_FastCall(__pyx_t_3, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - } else - #endif - { - __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_5); - __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL; - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_1); - __pyx_t_1 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_3, __pyx_t_5, NULL); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - } - } - __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - if (PyDict_SetItem(__pyx_d, __pyx_n_s_logger, __pyx_t_2) < 0) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + if (PyDict_SetItem(__pyx_d, __pyx_n_s_logger, __pyx_t_3) < 0) __PYX_ERR(0, 18, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* "(tree fragment)":1 * def __pyx_unpickle_MmReader(__pyx_type, long __pyx_checksum, __pyx_state): # <<<<<<<<<<<<<< * if __pyx_checksum != 0xea5fe92: * from pickle import PickleError as __pyx_PickleError */ - __pyx_t_2 = 
PyCFunction_NewEx(&__pyx_mdef_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader, NULL, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!__pyx_t_2)) __PYX_ERR(1, 1, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_pyx_unpickle_MmReader, __pyx_t_2) < 0) __PYX_ERR(1, 1, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_3 = PyCFunction_NewEx(&__pyx_mdef_6gensim_7corpora_9_mmreader_1__pyx_unpickle_MmReader, NULL, __pyx_n_s_gensim_corpora__mmreader); if (unlikely(!__pyx_t_3)) __PYX_ERR(1, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_pyx_unpickle_MmReader, __pyx_t_3) < 0) __PYX_ERR(1, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* "gensim/corpora/_mmreader.pyx":1 * # Copyright (C) 2018 Radim Rehurek # <<<<<<<<<<<<<< * # cython: embedsignature=True - * """Reader for corpus in the Matrix Market format.""" + * */ - __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_2); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __pyx_t_3 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_3) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /*--- Wrapped vars code ---*/ @@ -7047,8 +7141,6 @@ static int __pyx_pymod_exec__mmreader(PyObject *__pyx_pyinit_module) __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_2); __Pyx_XDECREF(__pyx_t_3); - __Pyx_XDECREF(__pyx_t_4); - __Pyx_XDECREF(__pyx_t_5); if (__pyx_m) { if (__pyx_d) { __Pyx_AddTraceback("init gensim.corpora._mmreader", 0, __pyx_lineno, __pyx_filename); @@ -7086,6 +7178,20 @@ static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { } #endif +/* PyObjectGetAttrStr */ +#if CYTHON_USE_TYPE_SLOTS 
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + /* GetBuiltinName */ static PyObject *__Pyx_GetBuiltinName(PyObject *name) { PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); @@ -7495,10 +7601,19 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) { static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { PyObject *result; #if !CYTHON_AVOID_BORROWED_REFS +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 + result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash); + if (likely(result)) { + Py_INCREF(result); + } else if (unlikely(PyErr_Occurred())) { + result = NULL; + } else { +#else result = PyDict_GetItem(__pyx_d, name); if (likely(result)) { Py_INCREF(result); } else { +#endif #else result = PyObject_GetItem(__pyx_d, name); if (!result) { @@ -7510,7 +7625,7 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) { } /* PyErrFetchRestore */ - #if CYTHON_FAST_THREAD_STATE + #if CYTHON_FAST_THREAD_STATE static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { PyObject *tmp_type, *tmp_value, *tmp_tb; tmp_type = tstate->curexc_type; @@ -7534,18 +7649,16 @@ static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject #endif /* IterNext */ - static PyObject *__Pyx_PyIter_Next2Default(PyObject* defval) { + static PyObject *__Pyx_PyIter_Next2Default(PyObject* defval) { PyObject* exc_type; __Pyx_PyThreadState_declare __Pyx_PyThreadState_assign exc_type = __Pyx_PyErr_Occurred(); if (unlikely(exc_type)) { - if 
(unlikely(!__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) + if (!defval || unlikely(!__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) return NULL; - if (defval) { - __Pyx_PyErr_Clear(); - Py_INCREF(defval); - } + __Pyx_PyErr_Clear(); + Py_INCREF(defval); return defval; } if (defval) { @@ -7576,15 +7689,22 @@ static CYTHON_INLINE PyObject *__Pyx_PyIter_Next2(PyObject* iterator, PyObject* if (likely(next)) return next; #endif - } else if (CYTHON_USE_TYPE_SLOTS || !PyIter_Check(iterator)) { + } else if (CYTHON_USE_TYPE_SLOTS || unlikely(!PyIter_Check(iterator))) { __Pyx_PyIter_Next_ErrorNoIterator(iterator); return NULL; } +#if !CYTHON_USE_TYPE_SLOTS + else { + next = PyIter_Next(iterator); + if (likely(next)) + return next; + } +#endif return __Pyx_PyIter_Next2Default(defval); } /* RaiseException */ - #if PY_MAJOR_VERSION < 3 + #if PY_MAJOR_VERSION < 3 static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, CYTHON_UNUSED PyObject *cause) { __Pyx_PyThreadState_declare @@ -7743,7 +7863,7 @@ static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject #endif /* SaveResetException */ - #if CYTHON_FAST_THREAD_STATE + #if CYTHON_FAST_THREAD_STATE static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { #if PY_VERSION_HEX >= 0x030700A2 *type = tstate->exc_state.exc_type; @@ -7782,7 +7902,7 @@ static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject #endif /* PyErrExceptionMatches */ - #if CYTHON_FAST_THREAD_STATE + #if CYTHON_FAST_THREAD_STATE static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { Py_ssize_t i, n; n = PyTuple_GET_SIZE(tuple); @@ -7807,7 +7927,7 @@ static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tsta #endif /* PyIntBinop */ - #if !CYTHON_COMPILING_IN_PYPY + #if !CYTHON_COMPILING_IN_PYPY static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, 
PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { #if PY_MAJOR_VERSION < 3 if (likely(PyInt_CheckExact(op1))) { @@ -7845,6 +7965,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto long_long; #endif } + CYTHON_FALLTHROUGH; case 2: if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); @@ -7855,6 +7976,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto long_long; #endif } + CYTHON_FALLTHROUGH; case -3: if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); @@ -7865,6 +7987,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto long_long; #endif } + CYTHON_FALLTHROUGH; case 3: if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); @@ -7875,6 +7998,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto long_long; #endif } + CYTHON_FALLTHROUGH; case -4: if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); @@ -7885,6 +8009,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto long_long; #endif } + CYTHON_FALLTHROUGH; case 4: if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { a = (long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); @@ -7895,6 +8020,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED goto 
long_long; #endif } + CYTHON_FALLTHROUGH; default: return PyLong_Type.tp_as_number->nb_add(op1, op2); } } @@ -7923,20 +8049,20 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED #endif /* RaiseTooManyValuesToUnpack */ - static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { + static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { PyErr_Format(PyExc_ValueError, "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected); } /* RaiseNeedMoreValuesToUnpack */ - static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) { + static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) { PyErr_Format(PyExc_ValueError, "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack", index, (index == 1) ? "" : "s"); } /* IterFinish */ - static CYTHON_INLINE int __Pyx_IterFinish(void) { + static CYTHON_INLINE int __Pyx_IterFinish(void) { #if CYTHON_FAST_THREAD_STATE PyThreadState *tstate = __Pyx_PyThreadState_Current; PyObject* exc_type = tstate->curexc_type; @@ -7971,7 +8097,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED } /* UnpackItemEndCheck */ - static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected) { + static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected) { if (unlikely(retval)) { Py_DECREF(retval); __Pyx_RaiseTooManyValuesError(expected); @@ -7983,7 +8109,7 @@ static PyObject* __Pyx_PyInt_AddObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED } /* GetException */ - #if CYTHON_FAST_THREAD_STATE + #if CYTHON_FAST_THREAD_STATE static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { #else static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) { @@ -8053,12 +8179,12 @@ static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) } /* None */ - static CYTHON_INLINE void 
__Pyx_RaiseUnboundLocalError(const char *varname) { + static CYTHON_INLINE void __Pyx_RaiseUnboundLocalError(const char *varname) { PyErr_Format(PyExc_UnboundLocalError, "local variable '%s' referenced before assignment", varname); } /* PyIntBinop */ - #if !CYTHON_COMPILING_IN_PYPY + #if !CYTHON_COMPILING_IN_PYPY static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED long intval, CYTHON_UNUSED int inplace) { if (op1 == op2) { Py_RETURN_TRUE; @@ -8090,31 +8216,37 @@ static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED a = -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; case 2: if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { a = (long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; case -3: if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { a = -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; case 3: if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { a = (long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; case -4: if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { a = -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; case 4: if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { a = (long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])); break; } + CYTHON_FALLTHROUGH; #if PyLong_SHIFT < 30 && PyLong_SHIFT != 15 default: return PyLong_Type.tp_richcompare(op1, op2, Py_EQ); #else @@ -8143,7 +8275,7 @@ static 
PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED #endif /* GetAttr */ - static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) { + static CYTHON_INLINE PyObject *__Pyx_GetAttr(PyObject *o, PyObject *n) { #if CYTHON_USE_TYPE_SLOTS #if PY_MAJOR_VERSION >= 3 if (likely(PyUnicode_Check(n))) @@ -8156,7 +8288,7 @@ static PyObject* __Pyx_PyInt_EqObjC(PyObject *op1, PyObject *op2, CYTHON_UNUSED } /* GetAttr3 */ - static PyObject *__Pyx_GetAttr3Default(PyObject *d) { + static PyObject *__Pyx_GetAttr3Default(PyObject *d) { __Pyx_PyThreadState_declare __Pyx_PyThreadState_assign if (unlikely(!__Pyx_PyErr_ExceptionMatches(PyExc_AttributeError))) @@ -8171,7 +8303,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject } /* Import */ - static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { PyObject *empty_list = 0; PyObject *module = 0; PyObject *global_dict = 0; @@ -8236,7 +8368,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject } /* ImportFrom */ - static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { + static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { PyObject* value = __Pyx_PyObject_GetAttrStr(module, name); if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) { PyErr_Format(PyExc_ImportError, @@ -8250,7 +8382,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetAttr3(PyObject *o, PyObject *n, PyObject } /* GetItemInt */ - static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { + static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) { PyObject *r; if (!j) return NULL; r = PyObject_GetItem(o, j); @@ -8337,7 +8469,7 @@ static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, } /* HasAttr */ - static CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) { + static 
CYTHON_INLINE int __Pyx_HasAttr(PyObject *o, PyObject *n) { PyObject *r; if (unlikely(!__Pyx_PyBaseString_Check(n))) { PyErr_SetString(PyExc_TypeError, @@ -8354,8 +8486,58 @@ static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, } } +/* PyObject_GenericGetAttrNoDict */ + #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject *__Pyx_RaiseGenericGetAttributeError(PyTypeObject *tp, PyObject *attr_name) { + PyErr_Format(PyExc_AttributeError, +#if PY_MAJOR_VERSION >= 3 + "'%.50s' object has no attribute '%U'", + tp->tp_name, attr_name); +#else + "'%.50s' object has no attribute '%.400s'", + tp->tp_name, PyString_AS_STRING(attr_name)); +#endif + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyObject_GenericGetAttrNoDict(PyObject* obj, PyObject* attr_name) { + PyObject *descr; + PyTypeObject *tp = Py_TYPE(obj); + if (unlikely(!PyString_Check(attr_name))) { + return PyObject_GenericGetAttr(obj, attr_name); + } + assert(!tp->tp_dictoffset); + descr = _PyType_Lookup(tp, attr_name); + if (unlikely(!descr)) { + return __Pyx_RaiseGenericGetAttributeError(tp, attr_name); + } + Py_INCREF(descr); + #if PY_MAJOR_VERSION < 3 + if (likely(PyType_HasFeature(Py_TYPE(descr), Py_TPFLAGS_HAVE_CLASS))) + #endif + { + descrgetfunc f = Py_TYPE(descr)->tp_descr_get; + if (unlikely(f)) { + PyObject *res = f(descr, obj, (PyObject *)tp); + Py_DECREF(descr); + return res; + } + } + return descr; +} +#endif + +/* PyObject_GenericGetAttr */ + #if CYTHON_USE_TYPE_SLOTS && CYTHON_USE_PYTYPE_LOOKUP && PY_VERSION_HEX < 0x03070000 +static PyObject* __Pyx_PyObject_GenericGetAttr(PyObject* obj, PyObject* attr_name) { + if (unlikely(Py_TYPE(obj)->tp_dictoffset)) { + return PyObject_GenericGetAttr(obj, attr_name); + } + return __Pyx_PyObject_GenericGetAttrNoDict(obj, attr_name); +} +#endif + /* SetupReduce */ - static int __Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) { + static int 
__Pyx_setup_reduce_is_named(PyObject* meth, PyObject* name) { int ret; PyObject *name_attr; name_attr = __Pyx_PyObject_GetAttrStr(meth, __pyx_n_s_name); @@ -8431,18 +8613,21 @@ static int __Pyx_setup_reduce(PyObject* type_obj) { } /* CLineInTraceback */ - #ifndef CYTHON_CLINE_IN_TRACEBACK + #ifndef CYTHON_CLINE_IN_TRACEBACK static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) { PyObject *use_cline; PyObject *ptype, *pvalue, *ptraceback; #if CYTHON_COMPILING_IN_CPYTHON PyObject **cython_runtime_dict; #endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); #if CYTHON_COMPILING_IN_CPYTHON cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); if (likely(cython_runtime_dict)) { - use_cline = PyDict_GetItem(*cython_runtime_dict, __pyx_n_s_cline_in_traceback); + use_cline = __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback); } else #endif { @@ -8468,7 +8653,7 @@ static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_li #endif /* CodeObjectCache */ - static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { int start = 0, mid = 0, end = count - 1; if (end >= 0 && code_line > entries[end].code_line) { return count; @@ -8548,7 +8733,7 @@ static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { } /* AddTraceback */ - #include "compile.h" + #include "compile.h" #include "frameobject.h" #include "traceback.h" static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( @@ -8633,7 +8818,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntToPy */ - static CYTHON_INLINE PyObject* __Pyx_PyInt_From_PY_LONG_LONG(PY_LONG_LONG value) { + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_PY_LONG_LONG(PY_LONG_LONG value) { const 
PY_LONG_LONG neg_one = (PY_LONG_LONG) -1, const_zero = (PY_LONG_LONG) 0; const int is_unsigned = neg_one > const_zero; if (is_unsigned) { @@ -8664,7 +8849,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntFromPyVerify */ - #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) #define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) @@ -8686,7 +8871,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntToPy */ - static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { const long neg_one = (long) -1, const_zero = (long) 0; const int is_unsigned = neg_one > const_zero; if (is_unsigned) { @@ -8717,7 +8902,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntFromPy */ - static CYTHON_INLINE PY_LONG_LONG __Pyx_PyInt_As_PY_LONG_LONG(PyObject *x) { + static CYTHON_INLINE PY_LONG_LONG __Pyx_PyInt_As_PY_LONG_LONG(PyObject *x) { const PY_LONG_LONG neg_one = (PY_LONG_LONG) -1, const_zero = (PY_LONG_LONG) 0; const int is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 @@ -8906,7 +9091,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntFromPy */ - static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { const long neg_one = (long) -1, const_zero = (long) 0; const int is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 @@ -9095,7 +9280,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* CIntFromPy */ - static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { const int neg_one = (int) -1, const_zero = 
(int) 0; const int is_unsigned = neg_one > const_zero; #if PY_MAJOR_VERSION < 3 @@ -9284,7 +9469,7 @@ static void __Pyx_AddTraceback(const char *funcname, int c_line, } /* FastTypeChecks */ - #if CYTHON_COMPILING_IN_CPYTHON + #if CYTHON_COMPILING_IN_CPYTHON static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { while (a) { a = a->tp_base; @@ -9356,7 +9541,7 @@ static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObj #endif /* FetchCommonType */ - static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) { + static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) { PyObject* fake_module; PyTypeObject* cached_type = NULL; fake_module = PyImport_AddModule((char*) "_cython_" CYTHON_ABI); @@ -9395,7 +9580,7 @@ static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObj } /* SwapException */ - #if CYTHON_FAST_THREAD_STATE + #if CYTHON_FAST_THREAD_STATE static CYTHON_INLINE void __Pyx__ExceptionSwap(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { PyObject *tmp_type, *tmp_value, *tmp_tb; #if PY_VERSION_HEX >= 0x030700A2 @@ -9429,7 +9614,7 @@ static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, #endif /* PyObjectCallMethod1 */ - static PyObject* __Pyx__PyObject_CallMethod1(PyObject* method, PyObject* arg) { + static PyObject* __Pyx__PyObject_CallMethod1(PyObject* method, PyObject* arg) { PyObject *result = NULL; #if CYTHON_UNPACK_METHODS if (likely(PyMethod_Check(method))) { @@ -9471,17 +9656,16 @@ static CYTHON_INLINE void __Pyx_ExceptionSwap(PyObject **type, PyObject **value, return result; } static PyObject* __Pyx_PyObject_CallMethod1(PyObject* obj, PyObject* method_name, PyObject* arg) { - PyObject *method, *result = NULL; + PyObject *method, *result; method = __Pyx_PyObject_GetAttrStr(obj, method_name); - if (unlikely(!method)) goto done; + if (unlikely(!method)) return NULL; result = __Pyx__PyObject_CallMethod1(method, arg); -done: - 
Py_XDECREF(method); + Py_DECREF(method); return result; } /* CoroutineBase */ - #include + #include #include #define __Pyx_Coroutine_Undelegate(gen) Py_CLEAR((gen)->yieldfrom) static int __Pyx_PyGen__FetchStopIterationValue(CYTHON_UNUSED PyThreadState *__pyx_tstate, PyObject **pvalue) { @@ -9580,7 +9764,7 @@ static void __Pyx__Coroutine_AlreadyRunningError(CYTHON_UNUSED __pyx_CoroutineOb const char *msg; if (0) { #ifdef __Pyx_Coroutine_USED - } else if (__Pyx_Coroutine_CheckExact((PyObject*)gen)) { + } else if (__Pyx_Coroutine_Check((PyObject*)gen)) { msg = "coroutine already executing"; #endif #ifdef __Pyx_AsyncGen_USED @@ -9597,7 +9781,7 @@ static void __Pyx__Coroutine_NotStartedError(CYTHON_UNUSED PyObject *gen) { const char *msg; if (0) { #ifdef __Pyx_Coroutine_USED - } else if (__Pyx_Coroutine_CheckExact(gen)) { + } else if (__Pyx_Coroutine_Check(gen)) { msg = "can't send non-None value to a just-started coroutine"; #endif #ifdef __Pyx_AsyncGen_USED @@ -9612,7 +9796,7 @@ static void __Pyx__Coroutine_NotStartedError(CYTHON_UNUSED PyObject *gen) { #define __Pyx_Coroutine_AlreadyTerminatedError(gen, value, closing) (__Pyx__Coroutine_AlreadyTerminatedError(gen, value, closing), (PyObject*)NULL) static void __Pyx__Coroutine_AlreadyTerminatedError(CYTHON_UNUSED PyObject *gen, PyObject *value, CYTHON_UNUSED int closing) { #ifdef __Pyx_Coroutine_USED - if (!closing && __Pyx_Coroutine_CheckExact(gen)) { + if (!closing && __Pyx_Coroutine_Check(gen)) { PyErr_SetString(PyExc_RuntimeError, "cannot reuse already awaited coroutine"); } else #endif @@ -9718,7 +9902,7 @@ static PyObject *__Pyx_Coroutine_Send(PyObject *self, PyObject *value) { } else #endif #ifdef __Pyx_Coroutine_USED - if (__Pyx_Coroutine_CheckExact(yf)) { + if (__Pyx_Coroutine_Check(yf)) { ret = __Pyx_Coroutine_Send(yf, value); } else #endif @@ -9764,7 +9948,7 @@ static int __Pyx_Coroutine_CloseIter(__pyx_CoroutineObject *gen, PyObject *yf) { } else #endif #ifdef __Pyx_Coroutine_USED - if 
(__Pyx_Coroutine_CheckExact(yf)) { + if (__Pyx_Coroutine_Check(yf)) { retval = __Pyx_Coroutine_Close(yf); if (!retval) return -1; @@ -9820,6 +10004,11 @@ static PyObject *__Pyx_Generator_Next(PyObject *self) { if (PyGen_CheckExact(yf)) { ret = _PyGen_Send((PyGenObject*)yf, NULL); } else + #endif + #ifdef __Pyx_Coroutine_USED + if (__Pyx_Coroutine_Check(yf)) { + ret = __Pyx_Coroutine_Send(yf, Py_None); + } else #endif ret = Py_TYPE(yf)->tp_iternext(yf); gen->is_running = 0; @@ -9851,7 +10040,7 @@ static PyObject *__Pyx_Coroutine_Close(PyObject *self) { Py_DECREF(retval); if ((0)) { #ifdef __Pyx_Coroutine_USED - } else if (__Pyx_Coroutine_CheckExact(self)) { + } else if (__Pyx_Coroutine_Check(self)) { msg = "coroutine ignored GeneratorExit"; #endif #ifdef __Pyx_AsyncGen_USED @@ -9899,7 +10088,7 @@ static PyObject *__Pyx__Coroutine_Throw(PyObject *self, PyObject *typ, PyObject || __Pyx_Generator_CheckExact(yf) #endif #ifdef __Pyx_Coroutine_USED - || __Pyx_Coroutine_CheckExact(yf) + || __Pyx_Coroutine_Check(yf) #endif ) { ret = __Pyx__Coroutine_Throw(yf, typ, val, tb, args, close_on_genexit); @@ -9968,6 +10157,7 @@ static int __Pyx_Coroutine_clear(PyObject *self) { Py_CLEAR(((__pyx_PyAsyncGenObject*)gen)->ag_finalizer); } #endif + Py_CLEAR(gen->gi_code); Py_CLEAR(gen->gi_name); Py_CLEAR(gen->gi_qualname); Py_CLEAR(gen->gi_modulename); @@ -10153,15 +10343,15 @@ __Pyx_Coroutine_set_qualname(__pyx_CoroutineObject *self, PyObject *value) return 0; } static __pyx_CoroutineObject *__Pyx__Coroutine_New( - PyTypeObject* type, __pyx_coroutine_body_t body, PyObject *closure, + PyTypeObject* type, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, PyObject *name, PyObject *qualname, PyObject *module_name) { __pyx_CoroutineObject *gen = PyObject_GC_New(__pyx_CoroutineObject, type); if (unlikely(!gen)) return NULL; - return __Pyx__Coroutine_NewInit(gen, body, closure, name, qualname, module_name); + return __Pyx__Coroutine_NewInit(gen, body, code, closure, name, 
qualname, module_name); } static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( - __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *closure, + __pyx_CoroutineObject *gen, __pyx_coroutine_body_t body, PyObject *code, PyObject *closure, PyObject *name, PyObject *qualname, PyObject *module_name) { gen->body = body; gen->closure = closure; @@ -10180,12 +10370,14 @@ static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( gen->gi_name = name; Py_XINCREF(module_name); gen->gi_modulename = module_name; + Py_XINCREF(code); + gen->gi_code = code; PyObject_GC_Track(gen); return gen; } /* PatchModuleWithCoroutine */ - static PyObject* __Pyx_Coroutine_patch_module(PyObject* module, const char* py_code) { + static PyObject* __Pyx_Coroutine_patch_module(PyObject* module, const char* py_code) { #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) int result; PyObject *globals, *result_obj; @@ -10225,7 +10417,7 @@ static __pyx_CoroutineObject *__Pyx__Coroutine_NewInit( } /* PatchGeneratorABC */ - #ifndef CYTHON_REGISTER_ABCS + #ifndef CYTHON_REGISTER_ABCS #define CYTHON_REGISTER_ABCS 1 #endif #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) @@ -10282,7 +10474,7 @@ static int __Pyx_patch_abc(void) { } /* Generator */ - static PyMethodDef __pyx_Generator_methods[] = { + static PyMethodDef __pyx_Generator_methods[] = { {"send", (PyCFunction) __Pyx_Coroutine_Send, METH_O, (char*) PyDoc_STR("send(arg) -> send 'arg' into generator,\nreturn next yielded value or raise StopIteration.")}, {"throw", (PyCFunction) __Pyx_Coroutine_Throw, METH_VARARGS, @@ -10295,6 +10487,7 @@ static PyMemberDef __pyx_Generator_memberlist[] = { {(char *) "gi_running", T_BOOL, offsetof(__pyx_CoroutineObject, is_running), READONLY, NULL}, {(char*) "gi_yieldfrom", T_OBJECT, offsetof(__pyx_CoroutineObject, yieldfrom), READONLY, (char*) PyDoc_STR("object being iterated by 'yield from', or None")}, + {(char*) "gi_code", T_OBJECT, offsetof(__pyx_CoroutineObject, 
gi_code), READONLY, NULL}, {0, 0, 0, 0, 0} }; static PyGetSetDef __pyx_Generator_getsets[] = { @@ -10363,7 +10556,7 @@ static PyTypeObject __pyx_GeneratorType_type = { #endif }; static int __pyx_Generator_init(void) { - __pyx_GeneratorType_type.tp_getattro = PyObject_GenericGetAttr; + __pyx_GeneratorType_type.tp_getattro = __Pyx_PyObject_GenericGetAttrNoDict; __pyx_GeneratorType_type.tp_iter = PyObject_SelfIter; __pyx_GeneratorType = __Pyx_FetchCommonType(&__pyx_GeneratorType_type); if (unlikely(!__pyx_GeneratorType)) { @@ -10373,7 +10566,7 @@ static int __pyx_Generator_init(void) { } /* CheckBinaryVersion */ - static int __Pyx_check_binary_version(void) { + static int __Pyx_check_binary_version(void) { char ctversion[4], rtversion[4]; PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION); PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion()); @@ -10389,7 +10582,7 @@ static int __pyx_Generator_init(void) { } /* InitStrings */ - static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { while (t->p) { #if PY_MAJOR_VERSION < 3 if (t->is_unicode) { @@ -10415,7 +10608,7 @@ static int __pyx_Generator_init(void) { if (!*t->p) return -1; if (PyObject_Hash(*t->p) == -1) - PyErr_Clear(); + return -1; ++t; } return 0; @@ -10629,6 +10822,9 @@ static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { Py_DECREF(x); return ival; } +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? 
__Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False); +} static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { return PyInt_FromSize_t(ival); } diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx index 63b3d6cfd0..f4844127a3 100644 --- a/gensim/corpora/_mmreader.pyx +++ b/gensim/corpora/_mmreader.pyx @@ -1,6 +1,8 @@ # Copyright (C) 2018 Radim Rehurek # cython: embedsignature=True + """Reader for corpus in the Matrix Market format.""" + from __future__ import with_statement from gensim import utils @@ -17,7 +19,7 @@ logger = logging.getLogger(__name__) cdef class MmReader(object): - """Matrix market file reader (fast Cython version), used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. + """Matrix market file reader (fast Cython version), used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the rows (~documents). @@ -32,10 +34,10 @@ cdef class MmReader(object): Number of non-zero terms. Notes - ---------- - Note that the file is read into memory one document at a time, not the whole - matrix at once (unlike scipy.io.mmread). This allows us to process corpora - which are larger than the available RAM. + ----- + Note that the file is read into memory one document at a time, not the whole matrix at once + (unlike e.g. `scipy.io.mmread` and other implementations). + This allows us to process corpora which are larger than the available RAM. """ cdef public input @@ -48,11 +50,11 @@ cdef class MmReader(object): Parameters ---------- input : {str, file-like object} - Path to input file in MM format or a file-like object that supports `seek()` - (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`). + Path to the input file in MM format or a file-like object that supports `seek()` + (e.g. smart_open objects). transposed : bool, optional - if True, expects lines to represent doc_id, term_id, value. 
Else, expects term_id, doc_id, value. + Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`? """ logger.info("initializing cython corpus reader from %s", input) @@ -83,7 +85,7 @@ cdef class MmReader(object): ) def __len__(self): - """Get size of corpus (number of documents).""" + """Get the corpus size: total number of documents.""" return self.num_docs def __str__(self): @@ -105,18 +107,18 @@ cdef class MmReader(object): break def __iter__(self): - """Iterate through corpus. + """Iterate through all documents in the corpus. Notes ------ Note that the total number of vectors returned is always equal to the number of rows specified - in the header, empty documents are inserted and yielded where appropriate, even if they are not explicitly + in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the Matrix Market file. Yields ------ (int, list of (int, number)) - Document id and Document in BoW format + Document id and document in sparse bag-of-words format. """ cdef long long docid, termid, previd @@ -165,17 +167,17 @@ cdef class MmReader(object): yield previd, [] def docbyoffset(self, offset): - """Get document at file offset `offset` (in bytes). + """Get the document at file offset `offset` (in bytes). Parameters ---------- offset : int - Offset, in bytes, of desired document. + File offset, in bytes, of the desired document. Returns ------ list of (int, str) - Document in BoW format. + Document in sparse bag-of-words format. 
""" # empty documents are not stored explicitly in MM format, so the index marks diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 9a9edf24ed..84e2ed9945 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -4,8 +4,7 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""This module implements the concept of Dictionary -- a mapping between words and their integer ids.""" +"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids.""" from __future__ import with_statement @@ -30,20 +29,23 @@ class Dictionary(utils.SaveLoad, Mapping): """Dictionary encapsulates the mapping between normalized words and their integer ids. + Notable instance attributes: + Attributes - --------- + ---------- token2id : dict of (str, int) token -> tokenId. id2token : dict of (int, str) - Reverse mapping for token2id, initialized in lazy manner to save memory. + Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed). dfs : dict of (int, int) - Document frequencies: token_id -> in how many documents contain this token. + Document frequencies: token_id -> how many documents contain this token. num_docs : int Number of documents processed. num_pos : int Total number of corpus positions (number of processed words). num_nnz : int - Total number of non-zeroes in the BOW matrix. + Total number of non-zeroes in the BOW matrix (sum of the number of unique + words per document over the entire corpus). """ def __init__(self, documents=None, prune_at=2000000): @@ -52,17 +54,17 @@ def __init__(self, documents=None, prune_at=2000000): Parameters ---------- documents : iterable of iterable of str, optional - Documents that used for initialization. + Documents to be used to initialize the mapping and collect corpus statistics. prune_at : int, optional - Total number of unique words. 
Dictionary will keep not more than `prune_at` words. + Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint. Examples -------- >>> from gensim.corpora import Dictionary >>> >>> texts = [['human', 'interface', 'computer']] - >>> dct = Dictionary(texts) # fit dictionary - >>> dct.add_documents([["cat", "say", "meow"], ["dog"]]) # update dictionary with new documents + >>> dct = Dictionary(texts) # initialize a Dictionary + >>> dct.add_documents([["cat", "say", "meow"], ["dog"]]) # add more document (extend the vocabulary) >>> dct.doc2bow(["dog", "computer", "non_existent_word"]) [(0, 1), (6, 1)] @@ -79,12 +81,12 @@ def __init__(self, documents=None, prune_at=2000000): self.add_documents(documents, prune_at=prune_at) def __getitem__(self, tokenid): - """Get token by provided `tokenid`. + """Get the string token that corresponds to `tokenid`. Parameters ---------- tokenid : int - Id of token + Id of token. Returns ------- @@ -94,7 +96,7 @@ def __getitem__(self, tokenid): Raises ------ KeyError - If `tokenid` isn't contained in :class:`~gensim.corpora.dictionary.Dictionary`. + If this Dictionary doesn't contain such `tokenid`. """ if len(self.id2token) != len(self.token2id): @@ -104,7 +106,7 @@ def __getitem__(self, tokenid): return self.id2token[tokenid] # will throw for non-existent ids def __iter__(self): - """Iterate over tokens that stored.""" + """Iterate over all tokens.""" return iter(self.keys()) if PY3: @@ -145,7 +147,9 @@ def __str__(self): @staticmethod def from_documents(documents): - """Create :class:`~gensim.corpora.dictionary.Dictionary` based on `documents` + """Create :class:`~gensim.corpora.dictionary.Dictionary` from `documents`. + + Equivalent to `Dictionary(documents=documents)`. Parameters ---------- @@ -155,7 +159,7 @@ def from_documents(documents): Returns ------- :class:`~gensim.corpora.dictionary.Dictionary` - Dictionary filled by `documents`. + Dictionary initialized from `documents`. 
""" return Dictionary(documents=documents) @@ -168,18 +172,17 @@ def add_documents(self, documents, prune_at=2000000): documents : iterable of iterable of str Input corpus. All tokens should be already **tokenized and normalized**. prune_at : int, optional - Total number of unique words. Dictionary will keep not more than `prune_at` words. + Dictionary will keep no more than `prune_at` words in its mapping, to limit its RAM footprint. Examples -------- >>> from gensim.corpora import Dictionary >>> >>> corpus = ["máma mele maso".split(), "ema má máma".split()] - >>> dct = Dictionary(corpus) >>> len(dct) 5 - >>> dct.add_documents([["this","is","sparta"],["just","joking"]]) + >>> dct.add_documents([["this", "is", "sparta"], ["just", "joking"]]) >>> len(dct) 10 @@ -200,21 +203,21 @@ def add_documents(self, documents, prune_at=2000000): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """Convert `document` into the bag-of-words (BoW) format = list of (token_id, token_count). + """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples. Parameters ---------- - document : list of str + document : list of str Input document. allow_update : bool, optional - If True - update dictionary in the process (i.e. add new tokens and update frequencies). + Update self, by adding new tokens from `document` and updating internal corpus statistics. return_missing : bool, optional - Also return missing tokens (that doesn't contains in current dictionary). + Return missing tokens (tokens present in `document` but not in self) with frequencies? Return ------ list of (int, int) - BoW representation of `document` + BoW representation of `document`. list of (int, int), dict of (str, int) If `return_missing` is True, return BoW representation of `document` + dictionary with missing tokens and their frequencies. 
@@ -223,9 +226,9 @@ def doc2bow(self, document, allow_update=False, return_missing=False): -------- >>> from gensim.corpora import Dictionary >>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()]) - >>> dct.doc2bow(["this","is","máma"]) + >>> dct.doc2bow(["this", "is", "máma"]) [(2, 1)] - >>> dct.doc2bow(["this","is","máma"], return_missing=True) + >>> dct.doc2bow(["this", "is", "máma"], return_missing=True) ([(2, 1)], {u'this': 1, u'is': 1}) """ @@ -265,9 +268,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False): def doc2idx(self, document, unknown_word_index=-1): """Convert `document` (a list of words) into a list of indexes = list of `token_id`. - - Notes - ----- Replace all unknown words i.e, words not in the dictionary with the index as set via `unknown_word_index`. Parameters @@ -280,7 +280,7 @@ def doc2idx(self, document, unknown_word_index=-1): Returns ------- list of int - Indexes in the dictionary for words in the `document` (preserving the order of words). + Token ids for tokens in `document`, in the same order. Examples -------- @@ -299,7 +299,7 @@ def doc2idx(self, document, unknown_word_index=-1): return [self.token2id.get(word, unknown_word_index) for word in document] def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): - """Filter tokens in dictionary by frequency. + """Filter out tokens in the dictionary by their frequency. Parameters ---------- @@ -315,15 +315,15 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N Notes ----- - For tokens that appear in: - - #. Less than `no_below` documents (absolute number) or \n - #. More than `no_above` documents (fraction of total corpus size, **not absolute number**). - #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). + This removes all tokens in the dictionary that are: + #. Less frequent than `no_below` documents (absolute number, e.g. `5`) or \n + #. 
More frequent than `no_above` documents (fraction of the total corpus size, e.g. `0.3`). + #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `keep_n=None`). - After the pruning, shrink resulting gaps in word ids. - Due to the gap shrinking, the same word may have a different word id before and after the call to this function! + After the pruning, resulting gaps in word ids are shrunk. + Due to this gap shrinking, **the same word may have a different word id before and after the call + to this function!** Examples -------- @@ -400,7 +400,8 @@ def filter_n_most_frequent(self, remove_n): def filter_tokens(self, bad_ids=None, good_ids=None): """Remove the selected `bad_ids` tokens from :class:`~gensim.corpora.dictionary.Dictionary`. - Alternative - keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest. + + Alternatively, keep selected `good_ids` in :class:`~gensim.corpora.dictionary.Dictionary` and remove the rest. Parameters ---------- @@ -438,7 +439,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None): self.compactify() def compactify(self): - """Assign new word ids to all words, shrinking gaps.""" + """Assign new word ids to all words, shrinking any gaps.""" logger.debug("rebuilding dictionary, shrinking gaps") # build mapping from old id -> new id @@ -457,7 +458,7 @@ def save_as_text(self, fname, sort_by_word=True): fname : str Path to output file. sort_by_word : bool, optional - if True - sort by word in lexicographical order. + Sort words in lexicographical order before writing them out? Notes ----- @@ -469,14 +470,15 @@ def save_as_text(self, fname, sort_by_word=True): .... id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE] - Warnings - -------- - Text format should be use for corpus inspection. Use :meth:`~gensim.corpora.dictionary.Dictionary.save` and - :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle) for better performance. 
+ This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable + to other tools and frameworks. For better performance and to store the entire object state, + including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and + :meth:`~gensim.corpora.dictionary.Dictionary.load` instead. See Also -------- :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text` + Load :class:`~gensim.corpora.dictionary.Dictionary` from text file. Examples -------- @@ -489,7 +491,7 @@ def save_as_text(self, fname, sort_by_word=True): >>> dct = Dictionary(corpus) >>> dct.save_as_text(tmp_fname) >>> - >>> loaded_dct = Dictionary.load_from_text("testdata") + >>> loaded_dct = Dictionary.load_from_text(tmp_fname) >>> assert dct.token2id == loaded_dct.token2id """ @@ -507,23 +509,20 @@ def save_as_text(self, fname, sort_by_word=True): fout.write(utils.to_utf8(line)) def merge_with(self, other): - """Merge another dictionary into this dictionary, mapping same tokens to the same ids and new tokens to new ids. + """Merge another dictionary into this dictionary, mapping the same tokens to the same ids + and new tokens to new ids. Notes ----- The purpose is to merge two corpora created using two different dictionaries: `self` and `other`. `other` can be any id=>word mapping (a dict, a Dictionary object, ...). - Get a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents + Return a transformation object which, when accessed as `result[doc_from_other_corpus]`, will convert documents from a corpus built using the `other` dictionary into a document using the new, merged dictionary. - Warnings - -------- - This method will change `self` dictionary. - Parameters ---------- - other : :class:`~gensim.corpora.dictionary.Dictionary` + other : {dict, :class:`~gensim.corpora.dictionary.Dictionary`} Other dictionary. 
Return @@ -571,16 +570,18 @@ def merge_with(self, other): @staticmethod def load_from_text(fname): """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file. + Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`. Parameters ---------- fname: str - Path to file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`. + Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`. See Also -------- :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text` + Save :class:`~gensim.corpora.dictionary.Dictionary` to text file. Examples -------- @@ -593,7 +594,7 @@ def load_from_text(fname): >>> dct = Dictionary(corpus) >>> dct.save_as_text(tmp_fname) >>> - >>> loaded_dct = Dictionary.load_from_text("testdata") + >>> loaded_dct = Dictionary.load_from_text(tmp_fname) >>> assert dct.token2id == loaded_dct.token2id """ diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index a478c5e79f..85922d16c7 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -4,10 +4,8 @@ # Copyright (C) 2012 Homer Strong, Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -This module implements the "hashing trick" [1]_ -- a mapping between words and their integer ids -using a fixed, static mapping (hash function). +"""Implements the `"hashing trick" `_ -- a mapping between words +and their integer ids using a fixed, static mapping (hash function). Notes ----- @@ -27,12 +25,6 @@ * Multiple words may map to the same id, causing hash collisions. The word <-> id mapping is no longer a bijection. - -References ----------- - -.. [1] http://en.wikipedia.org/wiki/Hashing-Trick - """ from __future__ import with_statement @@ -49,11 +41,7 @@ class HashDictionary(utils.SaveLoad, dict): - """ - Mapping between words and their integer ids, using a hashing function. 
- - Notes - ----- + """Mapping between words and their integer ids, using a hashing function. Unlike :class:`~gensim.corpora.dictionary.Dictionary`, building a :class:`~gensim.corpora.hashdictionary.HashDictionary` before using it **isn't a necessary step**. @@ -62,7 +50,6 @@ class HashDictionary(utils.SaveLoad, dict): Examples -------- - >>> from gensim.corpora import HashDictionary >>> >>> dct = HashDictionary(debug=False) # needs no training corpus! @@ -77,17 +64,16 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr Parameters ---------- - - documents : iterable of iterable of str + documents : iterable of iterable of str, optional Iterable of documents. If given, used to collect additional corpus statistics. :class:`~gensim.corpora.hashdictionary.HashDictionary` can work without these statistics (optional parameter). id_range : int, optional Number of hash-values in table, used as `id = myhash(key) %% id_range`. - myhash : function + myhash : function, optional Hash function, should support interface `myhash(str) -> int`, uses `zlib.adler32` by default. - debug : bool - If True - store which tokens have mapped to a given id. **Will use a lot of RAM**. + debug : bool, optional + Store which tokens have mapped to a given id? **Will use a lot of RAM**. If you find yourself running out of memory (or not sure that you really need raw tokens), keep `debug=False`. @@ -113,17 +99,18 @@ def __init__(self, documents=None, id_range=32000, myhash=zlib.adler32, debug=Tr def __getitem__(self, tokenid): """Get all words that have mapped to the given id so far, as a set. - Works only if you initialized your `HashDictionary` object with `debug=True`. + Warnings + -------- + Works only if you initialized your :class:`~gensim.corpora.hashdictionary.HashDictionary` object + with `debug=True`. Parameters ---------- - tokenid : int Token identifier (result of hashing). Return ------ - set of str Set of all words that have mapped to this id. 
@@ -136,13 +123,11 @@ def restricted_hash(self, token): Parameters ---------- - token : str Input token. Return ------ - int Hash value of `token`. @@ -169,17 +154,18 @@ def from_documents(*args, **kwargs): return HashDictionary(*args, **kwargs) def add_documents(self, documents): - """Collect corpus statistics from a corpus. Useful only if `debug=True`, to build - the reverse `id=>set(words)` mapping. + """Collect corpus statistics from a corpus. + + Warnings + -------- + Useful only if `debug=True`, to build the reverse `id=>set(words)` mapping. Notes ----- - This is only a convenience wrapper for calling `doc2bow` on each document with `allow_update=True`. Parameters ---------- - documents : iterable of list of str Collection of documents. @@ -208,8 +194,8 @@ def add_documents(self, documents): ) def doc2bow(self, document, allow_update=False, return_missing=False): - """Convert a sequence of words `document` into the bag-of-words format of - `[(word_id, word_count)]` (e.g. `[(1, 4), (150, 1), (2005, 2)]`). + """Convert a sequence of words `document` into the bag-of-words format of `[(word_id, word_count)]` + (e.g. `[(1, 4), (150, 1), (2005, 2)]`). Notes ----- @@ -225,8 +211,8 @@ def doc2bow(self, document, allow_update=False, return_missing=False): document : sequence of str A sequence of word tokens = **tokenized and normalized** strings. allow_update : bool, optional - If True - update corpus statistics and if `debug=True`, also the reverse id=>word mapping. - return_missing : bool + Update corpus statistics and if `debug=True`, also the reverse id=>word mapping? + return_missing : bool, optional Not used. Only here for compatibility with the Dictionary class. Return @@ -272,15 +258,18 @@ def doc2bow(self, document, allow_update=False, return_missing=False): return result def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): - """Filter tokens in the debug dictionary by their frequency. Only makes sense when `debug=True`. 
+ """Filter tokens in the debug dictionary by their frequency. Since :class:`~gensim.corpora.hashdictionary.HashDictionary` id range is fixed and doesn't depend on the number - of tokens seen, this doesn't really "remove" anything. - It only clears some supplementary statistics, for easier debugging and a smaller RAM footprint. + of tokens seen, this doesn't really "remove" anything. It only clears some + internal corpus statistics, for easier debugging and a smaller RAM footprint. + + Warnings + -------- + Only makes sense when `debug=True`. Parameters ---------- - no_below : int, optional Keep tokens which are contained in at least `no_below` documents. no_above : float, optional @@ -291,25 +280,12 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): Notes ----- - For tokens that appear in: #. Less than `no_below` documents (absolute number) or \n #. More than `no_above` documents (fraction of total corpus size, **not absolute number**). #. After (1) and (2), keep only the first `keep_n` most frequent tokens (or keep all if `None`). - Examples - -------- - - >>> from gensim.corpora import HashDictionary - >>> - >>> dct = HashDictionary(debug=True) - >>> - >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]] - >>> dct.filter_extremes(no_below=1, no_above=0.5, keep_n=1) - >>> print dct.token2id - {'maso': 15025} - """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] @@ -330,24 +306,25 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): ) def save_as_text(self, fname): - """Save the debug token=>id mapping to a text file. Only makes sense when `debug=True`, for debugging. + """Save the debug token=>id mapping to a text file. + + Warnings + -------- + Only makes sense when `debug=True`, for debugging. Parameters ---------- - fname : str Path to output file. 
Notes ----- - The format is: `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`. Examples -------- - >>> from gensim.corpora import HashDictionary >>> from gensim.test.utils import get_tmpfile >>> diff --git a/gensim/corpora/mmcorpus.py b/gensim/corpora/mmcorpus.py index d7770607e8..92048bb67d 100644 --- a/gensim/corpora/mmcorpus.py +++ b/gensim/corpora/mmcorpus.py @@ -4,8 +4,7 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Corpus in the Matrix Market format.""" +"""Corpus in the `Matrix Market format `_.""" import logging @@ -17,24 +16,28 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): - """Corpus in matrix market format. + """Corpus serialized using the `sparse coordinate Matrix Market format + `_. Wrap a term-document matrix on disk (in matrix-market format), and present it - as an object which supports iteration over the rows (~documents). + as an object which supports iteration over the matrix rows (~documents). + + Notable instance attributes: Attributes - ---------- + ------------------ num_docs : int - Number of documents in market matrix file. + Number of documents in the market matrix file. num_terms : int - Number of terms. + Number of features (terms, topics). num_nnz : int - Number of non-zero terms. + Number of non-zero elements in the sparse MM matrix. Notes - ---------- - Note that the file is read into memory one document at a time, not the whole matrix at once - (unlike :meth:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM. + ----- + The file is read into memory one document at a time, not the whole matrix at once, + unlike e.g. `scipy.io.mmread` and other implementations. This allows you to **process corpora which are larger + than the available RAM**, in a streamed manner. Example -------- @@ -47,7 +50,6 @@ class MmCorpus(matutils.MmReader, IndexedCorpus): ... 
pass """ - def __init__(self, fname): """ @@ -55,27 +57,26 @@ def __init__(self, fname): ---------- fname : {str, file-like object} Path to file in MM format or a file-like object that supports `seek()` - (e.g. :class:`gzip.GzipFile`, :class:`bz2.BZ2File`). + (e.g. a compressed file opened by `smart_open `_). """ - # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) matutils.MmReader.__init__(self, fname) def __iter__(self): - """Iterate through document. + """Iterate through all documents. Yields ------ - list of (int, str) - Document in BoW format. + list of (int, numeric) + Document in the `sparse Gensim bag-of-words format `__. Notes ------ The total number of vectors returned is always equal to the number of rows specified in the header. Empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the - Matrix Market file. + (sparse) Matrix Market file. """ for doc_id, doc in super(MmCorpus, self).__iter__(): @@ -83,7 +84,7 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): - """Save a corpus in the Matrix Market format to disk. + """Save a corpus to disk in the sparse coordinate Matrix Market format. Parameters ---------- @@ -92,16 +93,17 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): corpus : iterable of list of (int, number) Corpus in Bow format. id2word : dict of (int, str), optional - WordId -> Word. + Mapping between word_id -> word. Used to retrieve the total vocabulary size if provided. + Otherwise, the total vocabulary size is estimated based on the highest feature id encountered in `corpus`. progress_cnt : int, optional - Progress counter. + How often to report (log) progress. metadata : bool, optional - If true, writes out additional metadata. + Writes out additional metadata? 
- Notes - ----- - This function is automatically called by `MmCorpus.serialize`; don't - call it directly, call `serialize` instead. + Warnings + -------- + This function is automatically called by :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize`, don't + call it directly, call :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize` instead. Example ------- diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 1c088b9416..966c5d7924 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -6,19 +6,16 @@ # Copyright (C) 2018 Emmanouil Stergiadis # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - """Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump. +Uses multiprocessing internally to parallelize the work and process the dump more quickly. + Notes ----- -If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma -of each token (instead of plain alphabetic tokenizer). The package is available at [1]_ . - -See :mod:`~gensim.scripts.make_wiki` for a canned (example) script based on this module. +If you have the `pattern `_ package installed, +this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic tokenizer). -References ----------- -.. [1] https://github.com/clips/pattern +See :mod:`gensim.scripts.make_wiki` for a canned (example) command-line script based on this module. """ @@ -89,13 +86,7 @@ 'MediaWiki', 'User', 'Help', 'Book', 'Draft', 'WikiProject', 'Special', 'Talk' ] -"""MediaWiki namespaces [2]_ that ought to be ignored. - -References ----------- -.. [2] https://www.mediawiki.org/wiki/Manual:Namespace - -""" +"""`MediaWiki namespaces `_ that ought to be ignored.""" def find_interlinks(raw): @@ -110,6 +101,7 @@ def find_interlinks(raw): ------- dict Mapping from the linked article to the actual text found. 
+ """ filtered = filter_wiki(raw, promote_remaining=False, simplify_links=False) interlinks_raw = re.findall(RE_P16, filtered) @@ -143,6 +135,7 @@ def filter_wiki(raw, promote_remaining=True, simplify_links=True): ------- str `raw` without markup. + """ # parsing of the wiki markup is not perfect, but sufficient for our purposes # contributions to improving this code are welcome :) @@ -221,18 +214,13 @@ def remove_template(s): Returns ------- str - Сopy of `s` with all the wikimedia markup template removed. See [4]_ for wikimedia templates details. + Сopy of `s` with all the `wikimedia markup template `_ removed. Notes ----- Since template can be nested, it is difficult remove them using regular expressions. - References - ---------- - .. [4] http://meta.wikimedia.org/wiki/Help:Template - """ - # Find the start and end position of each template by finding the opening # '{{' and closing '}}' n_open, n_close = 0, 0 @@ -271,11 +259,8 @@ def remove_file(s): Returns ------- str - Сopy of `s` with all the 'File:' and 'Image:' markup replaced by their corresponding captions. [3]_ - - References - ---------- - .. [3] http://www.mediawiki.org/wiki/Help:Images + Сopy of `s` with all the 'File:' and 'Image:' markup replaced by their `corresponding captions + `_. """ # The regex RE_P15 match a File: or Image: markup @@ -287,7 +272,7 @@ def remove_file(s): def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Tokenize a piece of text from wikipedia. + """Tokenize a piece of text from Wikipedia. Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens. @@ -300,7 +285,7 @@ def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, token_max_len : int Maximal token length. lower : bool - If True - convert `content` to lower case. + Convert `content` to lower case? 
Returns ------- @@ -398,12 +383,12 @@ def extract_pages(f, filter_namespaces=False): def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Parse a wikipedia article, extract all tokens. + """Parse a Wikipedia article, extract all tokens. Notes ----- Set `tokenizer_func` (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`) parameter for languages - like japanese or thai to perform better tokenization. + like Japanese or Thai to perform better tokenization. The `tokenizer_func` needs to take 4 parameters: (text: str, token_min_len: int, token_max_len: int, lower: bool). Parameters @@ -420,7 +405,7 @@ def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, token_max_len : int Maximal token length. lower : bool - If True - convert article text to lower case. + Convert article text to lower case? Returns ------- @@ -478,7 +463,7 @@ def _process_article(args): class WikiCorpus(TextCorpus): - """Treat a wikipedia articles dump as a **read-only** corpus. + """Treat a Wikipedia articles dump as a read-only, streamed, memory-efficient corpus. Supported dump formats: @@ -489,7 +474,7 @@ class WikiCorpus(TextCorpus): Notes ----- - Dumps for English wikipedia can be founded `here `_. + Dumps for the English Wikipedia can be founded at https://dumps.wikimedia.org/enwiki/. 
Attributes ---------- @@ -503,39 +488,39 @@ class WikiCorpus(TextCorpus): Examples -------- + >>> from gensim.test.utils import datapath, get_tmpfile >>> from gensim.corpora import WikiCorpus, MmCorpus >>> - >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h - >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format and mapping + >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2") + >>> corpus_path = get_tmpfile("wiki-corpus.mm") + >>> + >>> wiki = WikiCorpus(path_to_wiki_dump) # create word->word_id mapping, ~8h on full wiki + >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping """ - def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True): - """Initialize the corpus. - - Unless a dictionary is provided, this scans the corpus once, - to determine its vocabulary. + """ Parameters ---------- fname : str - Path to file with wikipedia dump. + Path to the Wikipedia dump file. processes : int, optional - Number of processes to run, defaults to **number of cpu - 1**. + Number of processes to run, defaults to `max(1, number of cpu - 1)`. lemmatize : bool - Whether to use lemmatization instead of simple regexp tokenization. - Defaults to `True` if *pattern* package installed. + Use lemmatization instead of simple regexp tokenization. + Defaults to `True` if you have the `pattern `_ package installed. dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional Dictionary, if not provided, this scans the corpus once, to determine its vocabulary - (this needs **really long time**). - filter_namespaces : tuple of str + **IMPORTANT: this needs a really long time**. 
+ filter_namespaces : tuple of str, optional Namespaces to consider. tokenizer_func : function, optional Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`. - Need to support interface: - tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str. + If you inject your own tokenizer, it must conform to this interface: + `tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str` article_min_tokens : int, optional Minimum tokens in article. Article will be ignored if number of tokens is less. token_min_len : int, optional @@ -543,7 +528,11 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction token_max_len : int, optional Maximal token length. lower : bool, optional - If True - convert all text to lower case. + Convert all text to lower case? + + Warnings + -------- + Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. """ self.fname = fname @@ -565,15 +554,25 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.dictionary = dictionary def get_texts(self): - """Iterate over the dump, yielding list of tokens for each article. + """Iterate over the dump, yielding a list of tokens for each article that passed + the length and namespace filtering. + + Uses multiprocessing internally to parallelize the work and process the dump more quickly. Notes ----- This iterates over the **texts**. If you want vectors, just use the standard corpus interface instead of this method: - >>> for vec in wiki_corpus: - >>> print(vec) + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.corpora import WikiCorpus + >>> + >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2") + >>> + >>> for vec in WikiCorpus(path_to_wiki_dump): + ... 
pass Yields ------ @@ -583,7 +582,6 @@ def get_texts(self): List of tokens (extracted from the article), page id and article title otherwise. """ - articles, articles_all = 0, 0 positions, positions_all = 0, 0 diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 0261c290f9..327dc9c960 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -4,11 +4,13 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains implementations of basic interfaces used across the whole gensim package. -These interfaces usable for building corpus, transformation and similarity classes. +"""Basic interfaces used across the whole Gensim package. -All interfaces are realized as abstract base classes (i.e. some optional functionality is provided in the interface -itself, so that the interfaces should be inherited). +These interfaces are used for building corpora, model transformation and similarity queries. + +The interfaces are realized as abstract base classes. This means some functionality is already +provided in the interface itself, and subclasses should inherit from these interfaces +and implement the missing methods. """ @@ -68,16 +70,8 @@ class CorpusABC(utils.SaveLoad): Corpuses in different formats """ - def __iter__(self): - """Iterate over corpus, **should be overridden in inheritor class**. - - Raises - ------ - NotImplementedError - Since it's abstract class this iterator protocol should be overwritten in the inherited class. - - """ + """Iterate all over corpus.""" raise NotImplementedError('cannot instantiate abstract base class') def save(self, *args, **kwargs): @@ -85,51 +79,40 @@ def save(self, *args, **kwargs): Warnings -------- - This save only "state" of corpus class (not corpus-data at all), - for saving data please use :meth:`~gensim.interfaces.CorpusABC.save_corpus` instead`. + This save only the "state" of a corpus class, not the corpus data! 
- Parameters - ---------- - *args - Variable length argument list. - **kwargs - Arbitrary keyword arguments. + For saving data use the `serialize` method of the output format you'd like to use + (e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`). """ import warnings warnings.warn( - "corpus.save() stores only the (tiny) iteration object; " + "corpus.save() stores only the (tiny) iteration object in memory; " "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)" ) super(CorpusABC, self).save(*args, **kwargs) def __len__(self): - """Get size of the corpus (number of documents), **should be overridden in inheritor class**. - - Raises - ------ - NotImplementedError - Since it's abstract class this method should be reimplemented later. - - """ + """Get the corpus size = the total number of documents in it.""" raise NotImplementedError("must override __len__() before calling len(corpus)") @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """Saves given `corpus` to disk, **should be overridden in inheritor class**. + """Save `corpus` to disk. Some formats support saving the dictionary (`feature_id -> word` mapping), which can be provided by the optional `id2word` parameter. Notes ----- - Some corpus also support an index of where each document begins, so that the documents on disk + Some corpora also support random access via document indexing, so that the documents on disk can be accessed in O(1) time (see the :class:`gensim.corpora.indexedcorpus.IndexedCorpus` base class). + In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by :func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index at the same time. - Calling :func:`serialize() is preferred to calling :meth:`~gensim.interfaces.CorpusABC.save_corpus`. + Calling :func:`serialize() is preferred to calling :meth:`gensim.interfaces.CorpusABC.save_corpus`. 
Parameters ---------- @@ -140,27 +123,26 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): id2word : :class:`~gensim.corpora.Dictionary`, optional Dictionary of corpus. metadata : bool, optional - If True, will write some meta-information to `fname` too. + Write additional metadata to a separate too? """ raise NotImplementedError('cannot instantiate abstract base class') class TransformedCorpus(CorpusABC): - """Interface for corpus supports transformations.""" + """Interface for corpora that are the result of an online (streamed) transformation.""" def __init__(self, obj, corpus, chunksize=None, **kwargs): """ Parameters ---------- obj : object - Some corpus class from :mod:`gensim.corpora`. + A transformation :class:`~gensim.interfaces.TransformationABC` object that will be applied + to each document from `corpus` during iteration. corpus : iterable of list of (int, number) - Corpus in BoW format. + Corpus in bag-of-words format. chunksize : int, optional - If provided - more effective processing (by group of documents) will performed. - kwargs - Arbitrary keyword arguments. + If provided, a slightly more effective processing will be performed by grouping documents from `corpus`. """ self.obj, self.corpus, self.chunksize = obj, corpus, chunksize @@ -170,18 +152,18 @@ def __init__(self, obj, corpus, chunksize=None, **kwargs): self.metadata = False def __len__(self): - """Get size of the corpus.""" + """Get corpus size.""" return len(self.corpus) def __iter__(self): - """Iterate over the corpus. + """Iterate over the corpus, applying the selected transformation. - If `chunksize` is set, works in "batch-manner" (more efficient). + If `chunksize` was set in the constructor, works in "batch-manner" (more efficient). Yields ------ list of (int, number) - Document in BoW format + Documents in the sparse Gensim bag-of-words format. 
""" if self.chunksize: @@ -193,22 +175,26 @@ def __iter__(self): yield self.obj[doc] def __getitem__(self, docno): - """Get element from corpus index `docno`. + """Transform the document at position `docno` within `corpus` specified in the constructor. Parameters ---------- docno : int - Index of document in corpus. + Position of the document to transform. Document offset inside `self.corpus`. + + Notes + ----- + `self.corpus` must support random indexing. Returns ------- list of (int, number) - Document in BoW format + Transformed document in the sparse Gensim bag-of-words format. Raises ------ RuntimeError - If corpus doesn't support slicing (:meth`__getitem__` doesn't exists). + If corpus doesn't support index slicing (`__getitem__` doesn't exists). """ if hasattr(self.corpus, '__getitem__'): @@ -227,26 +213,17 @@ class TransformationABC(utils.SaveLoad): >>> from gensim.test.utils import common_dictionary, common_corpus >>> >>> model = LsiModel(common_corpus, id2word=common_dictionary) - >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on document from corpus. - >>> bow_corpus = model[common_corpus] # also, we can apply model on full corpus - + >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on one document from corpus. + >>> bow_corpus = model[common_corpus] # also, we can apply model on the full corpus """ - def __getitem__(self, vec): - """Get element of `transformations`, **should be overridden in inheritor class**. - - Transforms vector from one vector space into another **or** whole corpus into another. + """Transform a single document, or a whole corpus, from one vector space into another. Parameters ---------- - vec : object - Given vector. - - Raises - ------ - NotImplementedError - Since it's abstract class this method should be reimplemented later. + vec : {list of (int, number), iterable of list of (int, number)} + Document in bag-of-words, or streamed corpus. 
""" raise NotImplementedError('cannot instantiate abstract base class') @@ -257,11 +234,9 @@ def _apply(self, corpus, chunksize=None, **kwargs): Parameters ---------- corpus : iterable of list of (int, number) - Corpus in BoW format. + Corpus in sparse Gensim bag-of-words format. chunksize : int, optional - If provided - more effective processing (by group of documents) will performed. - kwargs - Arbitrary keyword arguments. + If provided, a more effective processing will performed. Returns ------- @@ -276,10 +251,9 @@ class SimilarityABC(utils.SaveLoad): """Interface for similarity search over a corpus. In all instances, there is a corpus against which we want to perform the similarity search. - For each similarity search, the input is a document and the output are its similarities + For each similarity search, the input is a document or a corpus, and the output are the similarities to individual corpus documents. - Examples -------- >>> from gensim.similarities import MatrixSimilarity @@ -296,56 +270,45 @@ class SimilarityABC(utils.SaveLoad): See Also -------- :mod:`gensim.similarities` - Provided different type of indexes for search. + Different index implementations of this interface. """ - def __init__(self, corpus): - """Initialization of object, **should be overridden in inheritor class**. + """ Parameters ---------- corpus : iterable of list of (int, number) - Corpus in BoW format. - - Raises - ------ - NotImplementedError - Since it's abstract class this method should be reimplemented later. + Corpus in sparse Gensim bag-of-words format. """ raise NotImplementedError("cannot instantiate Abstract Base Class") def get_similarities(self, doc): - """Get similarity measures of documents of corpus to given `doc`, **should be overridden in inheritor class**. + """Get similarities of the given document or corpus against this index. Parameters ---------- - doc : list of (int, number) - Document in BoW format. 
- - Raises - ------ - NotImplementedError - Since it's abstract class this method should be reimplemented later. + doc : {list of (int, number), iterable of list of (int, number)} + Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents. """ raise NotImplementedError("cannot instantiate Abstract Base Class") def __getitem__(self, query): - """Get access to similarities of document/corpus `query` to all documents in the corpus. - - Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities` + """Get similarities of the given document or corpus against this index. + Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally. Notes ----- - Passing corpus to `query` (instead of document) can be more efficient, because will processed in batching-way. + Passing an entire corpus as `query` can be more efficient than passing its documents one after another, + because it will issue queries in batches internally. Parameters ---------- - query : {list of (int, int), iterable of list of (int, int)} - Document or corpus in BoW format. + query : {list of (int, number), iterable of list of (int, number)} + Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents. Returns ------- @@ -383,12 +346,12 @@ def __getitem__(self, query): return matutils.full2sparse_clipped(result, self.num_best) def __iter__(self): - """Iterate over all documents, computes similarity against all other documents in the index. + """Iterate over all documents, compute similarity of each document against all other documents in the index. Yields ------ {`scipy.sparse.csr.csr_matrix`, list of (int, float)} - Similarity of current document and all documents of corpus. + Similarity of the current document and all documents in the corpus. 
""" # turn off query normalization (vectors in the index are assumed to be already normalized) diff --git a/gensim/matutils.py b/gensim/matutils.py index 80fd1e8c29..777de46817 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -4,7 +4,7 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains math helper functions.""" +"""Math helper functions.""" from __future__ import with_statement @@ -31,41 +31,42 @@ def blas(name, ndarray): - """Helper for getting BLAS function, used :func:`scipy.linalg.get_blas_funcs`. + """Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`. Parameters ---------- name : str - Name(s) of BLAS functions without type prefix. + Name(s) of BLAS functions, without the type prefix. ndarray : numpy.ndarray Arrays can be given to determine optimal prefix of BLAS routines. Returns ------- - fortran object - Fortran function for needed operation. + object + BLAS function for the needed operation on the given data type. """ return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] def argsort(x, topn=None, reverse=False): - """Get indices of the `topn` smallest elements in array `x`. + """Efficiently calculate indices of the `topn` smallest elements in array `x`. Parameters ---------- x : array_like - Array to sort. + Array to get the smallest element indices from. topn : int, optional - Number of indices of the smallest(greatest) elements to be returned if given, - otherwise - indices of all elements will be returned in ascending(descending) order. + Number of indices of the smallest (greatest) elements to be returned. + If not given, indices of all elements will be returned in ascending (descending) order. reverse : bool, optional - If True - return the `topn` greatest elements, in descending order. + Return the `topn` greatest elements in descending order, + instead of smallest elements in ascending order? 
Returns ------- numpy.ndarray - Array of `topn` indices that.sort the array in the required order. + Array of `topn` indices that sort the array in the requested order. """ x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...) @@ -83,38 +84,38 @@ def argsort(x, topn=None, reverse=False): def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0): - """Convert a streamed corpus in BoW format into a sparse matrix `scipy.sparse.csc_matrix`, + """Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`, with documents as columns. Notes ----- If the number of terms, documents and non-zero elements is known, you can pass - them here as parameters and a more memory efficient code path will be taken. + them here as parameters and a (much) more memory efficient code path will be taken. Parameters ---------- corpus : iterable of iterable of (int, number) Input corpus in BoW format num_terms : int, optional - If provided, the `num_terms` attributes in the corpus will be ignored. + Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored. dtype : data-type, optional - Data type of output matrix. + Data type of output CSC matrix. num_docs : int, optional - If provided, the `num_docs` attributes in the corpus will be ignored. + Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored. num_nnz : int, optional - If provided, the `num_nnz` attributes in the corpus will be ignored. + Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored. printprogress : int, optional - Print progress for every `printprogress` number of documents, - If 0 - nothing will be printed. + Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging. 
Returns ------- scipy.sparse.csc_matrix - Sparse matrix inferred based on `corpus`. + `corpus` converted into a sparse CSC matrix. See Also -------- :class:`~gensim.matutils.Sparse2Corpus` + Convert sparse format to Gensim corpus format. """ try: @@ -195,7 +196,7 @@ def pad(mat, padrow, padcol): def zeros_aligned(shape, dtype, order='C', align=128): - """Get array aligned at `align` byte boundary. + """Get array aligned at `align` byte boundary in memory. Parameters ---------- @@ -221,24 +222,24 @@ def zeros_aligned(shape, dtype, order='C', align=128): def ismatrix(m): - """Check does `m` numpy.ndarray or `scipy.sparse` matrix. + """Check whether `m` is a 2D `numpy.ndarray` or `scipy.sparse` matrix. Parameters ---------- m : object - Candidate for matrix + Object to check. Returns ------- bool - True if `m` is matrix, False otherwise. + Is `m` a 2D `numpy.ndarray` or `scipy.sparse` matrix. """ return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m) def any2sparse(vec, eps=1e-9): - """Convert a numpy.ndarray or `scipy.sparse` vector into gensim BoW format. + """Convert a numpy.ndarray or `scipy.sparse` vector into the Gensim bag-of-words format. Parameters ---------- @@ -261,16 +262,16 @@ def any2sparse(vec, eps=1e-9): def scipy2scipy_clipped(matrix, topn, eps=1e-9): - """Get a `scipy.sparse` vector / matrix consisting of 'topn' elements of the greatest magnitude (absolute value). + """Get the 'topn' elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix. Parameters ---------- matrix : `scipy.sparse` - Input vector / matrix. + Input vector or matrix (1D or 2D sparse array). topn : int - Number of greatest (by module) elements, that will be in result. + Number of greatest elements, in absolute value, to return. eps : float - PARAMETER IGNORED. + Ignored. 
Returns ------- @@ -315,12 +316,12 @@ def scipy2scipy_clipped(matrix, topn, eps=1e-9): def scipy2sparse(vec, eps=1e-9): - """Convert a scipy.sparse vector BoW format. + """Convert a scipy.sparse vector into the Gensim bag-of-words format. Parameters ---------- vec : `scipy.sparse` - Sparse vector + Sparse vector. eps : float, optional Value used for threshold, all coordinates less than `eps` will not be presented in result. @@ -328,7 +329,7 @@ def scipy2sparse(vec, eps=1e-9): Returns ------- list of (int, float) - Vector in BoW format. + Vector in Gensim bag-of-words format. """ vec = vec.tocsr() @@ -337,14 +338,14 @@ def scipy2sparse(vec, eps=1e-9): class Scipy2Corpus(object): - """Convert a sequence of dense/sparse vectors into a streamed gensim corpus object. + """Convert a sequence of dense/sparse vectors into a streamed Gensim corpus object. See Also -------- :func:`~gensim.matutils.corpus2csc` + Convert corpus in Gensim format to `scipy.sparse.csc` matrix. """ - def __init__(self, vecs): """ @@ -368,23 +369,25 @@ def __len__(self): def sparse2full(doc, length): - """Convert a document in BoW format into dense numpy array. + """Convert a document in Gensim bag-of-words format into a dense numpy array. Parameters ---------- doc : list of (int, number) - Document in BoW format + Document in BoW format. length : int - Length of result vector + Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly. + This is typically the vocabulary size or number of topics, depending on how you created `doc`. Returns ------- numpy.ndarray - Dense variant of `doc` vector. + Dense numpy vector for `doc`. See Also -------- :func:`~gensim.matutils.full2sparse` + Convert dense array to gensim bag-of-words format. """ result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value) @@ -398,23 +401,25 @@ def sparse2full(doc, length): def full2sparse(vec, eps=1e-9): - """Convert a dense array into the BoW format. 
+ """Convert a dense numpy array into the Gensim bag-of-words format. Parameters ---------- vec : numpy.ndarray - Input dense vector + Dense input vector. eps : float - Threshold value, if coordinate in `vec` < eps, this will not be presented in result. + Feature weight threshold value. Features with `abs(weight) < eps` are considered sparse and + won't be included in the BOW result. Returns ------- list of (int, float) - BoW format of `vec`. + BoW format of `vec`, with near-zero values omitted (sparse vector). See Also -------- :func:`~gensim.matutils.sparse2full` + Convert a document in Gensim bag-of-words format into a dense numpy array. """ vec = np.asarray(vec, dtype=float) @@ -428,6 +433,9 @@ def full2sparse(vec, eps=1e-9): def full2sparse_clipped(vec, topn, eps=1e-9): """Like :func:`~gensim.matutils.full2sparse`, but only return the `topn` elements of the greatest magnitude (abs). + This is more efficient that sorting a vector and then taking the greatest values, especially + where `len(vec) >> topn`. + Parameters ---------- vec : numpy.ndarray @@ -445,6 +453,7 @@ def full2sparse_clipped(vec, topn, eps=1e-9): See Also -------- :func:`~gensim.matutils.full2sparse` + Convert dense array to gensim bag-of-words format. """ # use np.argpartition/argsort and only form tuples that are actually returned. @@ -458,27 +467,29 @@ def full2sparse_clipped(vec, topn, eps=1e-9): def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): - """Convert corpus into a dense numpy array (documents will be columns). + """Convert corpus into a dense numpy 2D array, with documents as columns. Parameters ---------- corpus : iterable of iterable of (int, number) - Input corpus in BoW format. + Input corpus in the Gensim bag-of-words format. num_terms : int - Number of terms in dictionary (will be used as size of output vector. + Number of terms in the dictionary. X-axis of the resulting matrix. num_docs : int, optional - Number of documents in corpus. 
+ Number of documents in the corpus. If provided, a slightly more memory-efficient code path is taken. + Y-axis of the resulting matrix. dtype : data-type, optional - Data type of output matrix + Data type of the output matrix. Returns ------- numpy.ndarray - Dense array that present `corpus`. + Dense 2D array that presents `corpus`. See Also -------- :class:`~gensim.matutils.Dense2Corpus` + Convert dense matrix to Gensim corpus format. """ if num_docs is not None: @@ -493,16 +504,18 @@ def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32): class Dense2Corpus(object): - """Treat dense numpy array as a streamed gensim corpus in BoW format. + """Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format. Notes ----- - No data copy is made (changes to the underlying matrix imply changes in the corpus). + No data copy is made (changes to the underlying matrix imply changes in the streamed corpus). See Also -------- :func:`~gensim.matutils.corpus2dense` + Convert Gensim corpus to dense matrix. :class:`~gensim.matutils.Sparse2Corpus` + Convert sparse matrix to Gensim corpus format. """ def __init__(self, dense, documents_columns=True): @@ -513,7 +526,7 @@ def __init__(self, dense, documents_columns=True): dense : numpy.ndarray Corpus in dense format. documents_columns : bool, optional - If True - documents will be column, rows otherwise. + Documents in `dense` represented as columns, as opposed to rows? """ if documents_columns: @@ -522,7 +535,7 @@ def __init__(self, dense, documents_columns=True): self.dense = dense def __iter__(self): - """Iterate over corpus + """Iterate over the corpus. Yields ------ @@ -538,12 +551,14 @@ def __len__(self): class Sparse2Corpus(object): - """Convert a matrix in scipy.sparse format into a streaming gensim corpus. + """Convert a matrix in scipy.sparse format into a streaming Gensim corpus. 
See Also -------- :func:`~gensim.matutils.corpus2csc` + Convert gensim corpus format to `scipy.sparse.csc` matrix :class:`~gensim.matutils.Dense2Corpus` + Convert dense matrix to gensim corpus. """ def __init__(self, sparse, documents_columns=True): @@ -554,7 +569,7 @@ def __init__(self, sparse, documents_columns=True): sparse : `scipy.sparse` Corpus scipy sparse format documents_columns : bool, optional - If True - documents will be column, rows otherwise. + Documents will be column? """ if documents_columns: @@ -578,7 +593,7 @@ def __len__(self): return self.sparse.shape[1] def __getitem__(self, document_index): - """Get a single document in the corpus by its index. + """Retrieve a document vector from the corpus by its index. Parameters ---------- @@ -597,12 +612,12 @@ def __getitem__(self, document_index): def veclen(vec): - """Calculate length of vector + """Calculate L2 (euclidean) length of a vector. Parameters ---------- vec : list of (int, number) - Input vector in BoW format. + Input vector in sparse bag-of-words format. Returns ------- @@ -618,7 +633,7 @@ def veclen(vec): def ret_normalized_vec(vec, length): - """Normalize vector. + """Normalize a vector in L2 (Euclidean unit norm). Parameters ---------- @@ -630,7 +645,7 @@ def ret_normalized_vec(vec, length): Returns ------- list of (int, number) - Normalized vector in BoW format. + L2-normalized vector in BoW format. """ if length != 1.0: @@ -674,16 +689,16 @@ def unitvec(vec, norm='l2', return_norm=False): vec : {numpy.ndarray, scipy.sparse, list of (int, float)} Input vector in any format norm : {'l1', 'l2'}, optional - Normalization that will be used. + Metric to normalize in. return_norm : bool, optional - If True - returns the length of vector `vec`. + Return the length of vector `vec`, in addition to the normalized vector itself? Returns ------- numpy.ndarray, scipy.sparse, list of (int, float)} Normalized vector in same format as `vec`. float - Length of `vec` before normalization. 
+ Length of `vec` before normalization, if `return_norm` is set. Notes ----- @@ -752,14 +767,15 @@ def unitvec(vec, norm='l2', return_norm=False): def cossim(vec1, vec2): """Get cosine similarity between two sparse vectors. - The similarity is a number between <-1.0, 1.0>, higher is more similar. + + Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar. Parameters ---------- vec1 : list of (int, float) - Vector in BoW format + Vector in BoW format. vec2 : list of (int, float) - Vector in BoW format + Vector in BoW format. Returns ------- @@ -784,9 +800,15 @@ def softcossim(vec1, vec2, similarity_matrix): """Get Soft Cosine Measure between two vectors given a term similarity matrix. Return Soft Cosine Measure between two sparse vectors given a sparse term similarity matrix - in the :class:`scipy.sparse.csc_matrix` format. The similarity is a number between <-1.0, 1.0>, + in the :class:`scipy.sparse.csc_matrix` format. The similarity is a number between `<-1.0, 1.0>`, higher is more similar. + Notes + ----- + Soft Cosine Measure was perhaps first defined by `Grigori Sidorov et al., + "Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model" + `_. + Parameters ---------- vec1 : list of (int, float) @@ -814,13 +836,6 @@ def softcossim(vec1, vec2, similarity_matrix): :class:`gensim.similarities.docsim.SoftCosineSimilarity` A class for performing corpus-based similarity queries with Soft Cosine Measure. - References - ---------- - Soft Cosine Measure was perhaps first defined by [sidorovetal14]_. - - .. [sidorovetal14] Grigori Sidorov et al., "Soft Similarity and Soft Cosine Measure: Similarity - of Features in Vector Space Model", 2014, http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921. 
- """ if not isinstance(similarity_matrix, scipy.sparse.csc_matrix): if isinstance(similarity_matrix, scipy.sparse.csr_matrix): @@ -852,17 +867,17 @@ def softcossim(vec1, vec2, similarity_matrix): def isbow(vec): - """Checks if vector passed is in BoW format. + """Checks if a vector is in the sparse Gensim bag-of-words format. Parameters ---------- vec : object - Input vector in any format + Object to check. Returns ------- bool - True if vector in BoW format, False otherwise. + Is `vec` in BoW format. """ if scipy.sparse.issparse(vec): @@ -877,24 +892,7 @@ def isbow(vec): return True -def convert_vec(vec1, vec2, num_features=None): - """Convert vectors to dense format - - Parameters - ---------- - vec1 : {scipy.sparse, list of (int, float)} - Input vector. - vec2 : {scipy.sparse, list of (int, float)} - Input vector. - num_features : int, optional - Number of features in vector. - - Returns - ------- - (numpy.ndarray, numpy.ndarray) - (`vec1`, `vec2`) in dense format. - - """ +def _convert_vec(vec1, vec2, num_features=None): if scipy.sparse.issparse(vec1): vec1 = vec1.toarray() if scipy.sparse.issparse(vec2): @@ -929,16 +927,16 @@ def kullback_leibler(vec1, vec2, num_features=None): vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} Distribution vector. num_features : int, optional - Number of features in vector. + Number of features in the vectors. Returns ------- float Kullback-Leibler distance between `vec1` and `vec2`. - Value in range [0, +∞) where values closer to 0 mean less distance (and a higher similarity). + Value in range [0, +∞) where values closer to 0 mean less distance (higher similarity). """ - vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features) + vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features) return entropy(vec1, vec2) @@ -952,7 +950,7 @@ def jensen_shannon(vec1, vec2, num_features=None): vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)} Distribution vector. 
num_features : int, optional - Number of features in vector. + Number of features in the vectors. Returns ------- @@ -961,10 +959,10 @@ def jensen_shannon(vec1, vec2, num_features=None): Notes ----- - This is symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`. + This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`. """ - vec1, vec2 = convert_vec(vec1, vec2, num_features=num_features) + vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features) avg_vec = 0.5 * (vec1 + vec2) return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec)) @@ -983,7 +981,7 @@ def hellinger(vec1, vec2): ------- float Hellinger distance between `vec1` and `vec2`. - Value in range [0, 1], where 0 is min distance (max similarity) and 1 is max distance (min similarity). + Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). """ if scipy.sparse.issparse(vec1): @@ -1004,7 +1002,7 @@ def hellinger(vec1, vec2): def jaccard(vec1, vec2): - """Calculate Jaccard distance between vectors. + """Calculate Jaccard distance between two vectors. Parameters ---------- @@ -1017,7 +1015,7 @@ def jaccard(vec1, vec2): ------- float Jaccard distance between `vec1` and `vec2`. - Value in range [0, 1], where 0 is min distance (max similarity) and 1 is max distance (min similarity). + Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). """ @@ -1050,7 +1048,7 @@ def jaccard(vec1, vec2): def jaccard_distance(set1, set2): - """Calculate Jaccard distance between two sets + """Calculate Jaccard distance between two sets. Parameters ---------- @@ -1063,7 +1061,7 @@ def jaccard_distance(set1, set2): ------- float Jaccard distance between `set1` and `set2`. - Value in range [0, 1], where 0 is min distance (max similarity) and 1 is max distance (min similarity). 
+ Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity). """ union_cardinality = len(set1 | set2) @@ -1093,7 +1091,7 @@ def logsumexp(x): Warnings -------- - By performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`. + For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`. """ x_max = np.max(x) @@ -1144,20 +1142,25 @@ def dirichlet_expectation(alpha): def qr_destroy(la): """Get QR decomposition of `la[0]`. - Notes - ----- - Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`, - because the memory used in `la[0]` is reclaimed earlier. - + Parameters + ---------- + la : list of numpy.ndarray + Run QR decomposition on the first elements of `la`. Must not be empty. Returns ------- (numpy.ndarray, numpy.ndarray) Matrices :math:`Q` and :math:`R`. + Notes + ----- + Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`, + because the memory used in `la[0]` is reclaimed earlier. This makes a difference when + decomposing very large arrays, where every memory copy counts. + Warnings -------- - Content of `la` gets destroyed in the process. + Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-effiency reasons. """ a = np.asfortranarray(la[0]) @@ -1182,20 +1185,22 @@ def qr_destroy(la): class MmWriter(object): - """Store a corpus in Matrix Market format, used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. + """Store a corpus in `Matrix Market format `_, + using :class:`~gensim.corpora.mmcorpus.MmCorpus`. Notes ----- - Output is written one document at a time, not the whole matrix at once (unlike `scipy.io.mmread`). - This allows us to process corpora which are larger than the available RAM. + The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`). 
+ This allows you to write corpora which are larger than the available RAM. The output file is created in a single pass through the input corpus, so that the input can be - a once-only stream (iterator). To achieve this, a fake MM header is written first, statistics are collected + a once-only stream (generator). + + To achieve this, a fake MM header is written first, corpus statistics are collected during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file, - rewriting the fake header with proper values. + rewriting the fake header with the final values. """ - HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' # the only supported MM format def __init__(self, fname): @@ -1242,7 +1247,7 @@ def write_headers(self, num_docs, num_terms, num_nnz): self.headers_written = True def fake_headers(self, num_docs, num_terms, num_nnz): - """Write "fake" headers to file. + """Write "fake" headers to file, to be rewritten once we've scanned the entire corpus. Parameters ---------- @@ -1287,22 +1292,22 @@ def write_vector(self, docno, vector): @staticmethod def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False): - """Save the corpus to disk in Matrix Market format. + """Save the corpus to disk in `Matrix Market format `_. Parameters ---------- fname : str Filename of the resulting file. corpus : iterable of list of (int, number) - Corpus in Bow format. + Corpus in streamed bag-of-words format. progress_cnt : int, optional Print progress for every `progress_cnt` number of documents. index : bool, optional - If True, the offsets will be return, otherwise return None. + Return offsets? num_terms : int, optional - If provided, the `num_terms` attributes in the corpus will be ignored. + Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored. metadata : bool, optional - If True, a metadata file will be generated. 
+ Generate a metadata file? Returns ------- @@ -1315,7 +1320,8 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, See Also -------- - :func:`~gensim.corpora.mmcorpus.MmCorpus.save_corpus` + :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus` + Save corpus to disk. """ mw = MmWriter(fname) @@ -1372,7 +1378,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, return offsets def __del__(self): - """Close `self.fout` file, alias for :meth:`~gensim.matutils.MmWriter.close`. + """Close `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`. Warnings -------- @@ -1395,7 +1401,7 @@ def close(self): FAST_VERSION = -1 class MmReader(object): - """Matrix market file reader, used for :class:`~gensim.corpora.mmcorpus.MmCorpus`. + """Matrix market file reader, used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the rows (~documents). @@ -1403,30 +1409,29 @@ class MmReader(object): Attributes ---------- num_docs : int - number of documents in market matrix file + Number of documents in market matrix file. num_terms : int - number of terms + Number of terms. num_nnz : int - number of non-zero terms + Number of non-zero terms. Notes - ---------- + ----- Note that the file is read into memory one document at a time, not the whole matrix at once - (unlike :meth:`~scipy.io.mmread`). This allows us to process corpora which are larger than the available RAM. + (unlike e.g. `scipy.io.mmread` and other implementations). + This allows us to process corpora which are larger than the available RAM. """ - def __init__(self, input, transposed=True): """ Parameters ---------- input : {str, file-like object} - Path to input file in MM format or a file-like object that supports `seek()` - (e.g. :class:`~gzip.GzipFile`, :class:`~bz2.BZ2File`). 
- + Path to the input file in MM format or a file-like object that supports `seek()` + (e.g. smart_open objects). transposed : bool, optional - if True, expects lines to represent doc_id, term_id, value. Else, expects term_id, doc_id, value. + Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`? """ logger.info("initializing corpus reader from %s", input) @@ -1457,7 +1462,7 @@ def __init__(self, input, transposed=True): ) def __len__(self): - """Get size of corpus (number of documents).""" + """Get the corpus size: total number of documents.""" return self.num_docs def __str__(self): @@ -1479,18 +1484,18 @@ def skip_headers(self, input_file): break def __iter__(self): - """Iterate through corpus. + """Iterate through all documents in the corpus. Notes ------ Note that the total number of vectors returned is always equal to the number of rows specified - in the header, empty documents are inserted and yielded where appropriate, even if they are not explicitly + in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the Matrix Market file. Yields ------ (int, list of (int, number)) - Document id and Document in BoW format + Document id and document in sparse bag-of-words format. """ with utils.file_or_filename(self.input) as lines: @@ -1530,17 +1535,17 @@ def __iter__(self): yield previd, [] def docbyoffset(self, offset): - """Get document at file offset `offset` (in bytes). + """Get the document at file offset `offset` (in bytes). Parameters ---------- offset : int - Offset, in bytes, of desired document. + File offset, in bytes, of the desired document. Returns ------ list of (int, str) - Document in BoW format. + Document in sparse bag-of-words format. 
""" # empty documents are not stored explicitly in MM format, so the index marks diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 5ff4e99051..ed6f4c44df 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -2,7 +2,15 @@ # -*- coding: utf-8 -*- # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Automatically detect common phrases (multi-word expressions / bi-grams) from a stream of sentences. +"""Automatically detect common phrases -- multi-word expressions / word n-grams -- from a stream of sentences. + +Inspired by: + +* `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality" + `_ +* `"Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma + `_ + Examples -------- @@ -25,6 +33,7 @@ ... pass """ + import sys import os import logging @@ -86,7 +95,6 @@ def _is_single(obj): class SentenceAnalyzer(object): """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`.""" - def score_item(self, worda, wordb, components, scorer): """Get bi-gram score statistics. @@ -119,16 +127,16 @@ def score_item(self, worda, wordb, components, scorer): return -1 def analyze_sentence(self, sentence, threshold, common_terms, scorer): - """Analyze a sentence. + """Analyze a sentence, detecting any bigrams that should be concatenated. Parameters ---------- - sentence : list of str - Token list representing the sentence to be analyzed. + sentence : iterable of str + Token sequence representing the sentence to be analyzed. threshold : float The minimum score for a bigram to be taken into account. common_terms : list of object - List of common terms, they have a special treatment. + List of common terms, they have special treatment. scorer : function Scorer function, as given to :class:`~gensim.models.phrases.Phrases`. 
See :func:`~gensim.models.phrases.npmi_scorer` and :func:`~gensim.models.phrases.original_scorer`. @@ -136,8 +144,8 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer): Yields ------ (str, score) - Tuple where first element is bi-gram, second is score (if bi-gram detected), - otherwise - first element is word and second is None. + If bi-gram detected, a tuple where the first element is a detect bigram, second its score. + Otherwise, the first tuple element is a single word and second is None. """ s = [utils.any2utf8(w) for w in sentence] @@ -265,21 +273,21 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, Notes ----- 'npmi' is more robust when dealing with common words that form part of common bigrams, and - ranges from -1 to 1, but is slower to calculate than the default. + ranges from -1 to 1, but is slower to calculate than the default. The default is the PMI-like scoring + as described by `Mikolov, et. al: "Distributed Representations of Words and Phrases and their Compositionality" + `_. - To use a custom scoring function, create a function with the following parameters and set the `scoring` - parameter to the custom function. You must use all the parameters in your function call, even if the - function does not require all the parameters. 
+ To use a custom scoring function, pass in a function with the following signature: - * worda_count - number of occurrences in `sentences` of the first token in the phrase being scored - * wordb_count - number of occurrences in `sentences` of the second token in the phrase being scored - * bigram_count - number of occurrences in `sentences` of the phrase being scored + * worda_count - number of corpus occurrences in `sentences` of the first token in the bigram being scored + * wordb_count - number of corpus occurrences in `sentences` of the second token in the bigram being scored + * bigram_count - number of occurrences in `sentences` of the whole bigram * len_vocab - the number of unique tokens in `sentences` * min_count - the `min_count` setting of the Phrases class * corpus_word_count - the total number of tokens (non-unique) in `sentences` - A scoring function without any of these parameters (even if the parameters are not used) will - raise a ValueError on initialization of the Phrases class. The scoring function must be picklable. + The scoring function **must accept all these parameters**, even if it doesn't use them in its scoring. + The scoring function **must be pickleable**. """ if min_count <= 0: @@ -336,8 +344,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, @classmethod def load(cls, *args, **kwargs): - """Load a previously saved Phrases class. Handles backwards compatibility from older Phrases versions - which did not support pluggable scoring functions. + """Load a previously saved Phrases class. + Handles backwards compatibility from older Phrases versions which did not support pluggable scoring functions. Parameters ---------- @@ -493,21 +501,21 @@ def add_vocab(self, sentences): self.vocab = vocab def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): - """Get all phrases from given 'sentences'. + """Get all phrases that appear in 'sentences' that pass the bigram threshold. 
Parameters ---------- sentences : iterable of list of str Text corpus. out_delimiter : str, optional - Delimiter that will be used for "glue" words to phrase. + Delimiter used to "glue" together words that form a bigram phrase. as_tuples : bool, optional - If True - yield (tuple(words), score), otherwise - (out_delimiter.join(words), score). + Yield `(tuple(words), score)` instead of `(out_delimiter.join(words), score)`? Yields ------ ((str, str), float) **or** (str, float) - Phrases given from `sentences`, type depends on `as_tuples` parameter. + Phrases detected in `sentences`. Return type depends on the `as_tuples` parameter. Example ------- @@ -544,7 +552,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): yield (out_delimiter.join(words), score) def __getitem__(self, sentence): - """Convert the input tokens `sentence` into phrase tokens (where detected phrases are joined by delimiter). + """Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter. If `sentence` is an entire corpus (iterable of sentences rather than a single sentence), return an iterable that converts each of the corpus' sentences @@ -557,8 +565,9 @@ def __getitem__(self, sentence): Returns ------- - {list of str, :class:`gensim.iterfaces.TransformedCorpus`} - `sentences` with phrases, type depends on `sentence` type. + {list of str, :class:`gensim.interfaces.TransformedCorpus`} + `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences + if the input was a corpus. 
Examples ---------- @@ -576,16 +585,13 @@ def __getitem__(self, sentence): >>> #Both of these tokens appear in corpus at least twice, and phrase score is higher, than treshold = 1: >>> print(phrases[sent]) [u'trees_graph', u'minors'] - - >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus - >>> from gensim.models.phrases import Phrases >>> >>> sentences = Text8Corpus(datapath('testcorpus.txt')) >>> phrases = Phrases(sentences, min_count=1, threshold=1) + >>> phraser = Phraser(phrases) # for speedup >>> >>> sent = [[u'trees', u'graph', u'minors'],[u'graph', u'minors']] - >>> for phrase in phrases[sent]: + >>> for phrase in phraser[sent]: ... pass """ @@ -621,8 +627,8 @@ def __getitem__(self, sentence): def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - """Calculation score, based on original `"Efficient Estimaton of Word Representations in Vector Space" by - Mikolov `_. + """Bigram scoring function, based on the original `Mikolov, et. al: "Distributed Representations + of Words and Phrases and their Compositionality" `_. Parameters ---------- @@ -635,13 +641,13 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count len_vocab : int Size of vocabulary. min_count: int - Minimal score threshold. + Minimum score threshold. corpus_word_count : int - NOT USED. + Not used in this particular scoring technique. Notes ----- - Formula: :math:`\\frac{(worda\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`. + Formula: :math:`\\frac{(bigram\_count - min\_count) * len\_vocab }{ (worda\_count * wordb\_count)}`. """ return (bigram_count - min_count) / worda_count / wordb_count * len_vocab @@ -660,11 +666,11 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co bigram_count : int Number of co-occurrences for phrase "worda_wordb". len_vocab : int - NOT USED. + Not used. min_count: int - NOT USED. + Not used. 
corpus_word_count : int - Number of words in corpus. + Total number of words in the corpus. Notes ----- @@ -679,7 +685,7 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co def pseudocorpus(source_vocab, sep, common_terms=frozenset()): - """Feeds source_vocab's compound keys back to it, to discover phrases. + """Feeds `source_vocab`'s compound keys back to it, to discover phrases. Parameters ---------- @@ -712,7 +718,14 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()): class Phraser(SentenceAnalyzer, PhrasesTransformation): - """Minimal state & functionality to apply results of a :class:`~gensim.models.phrases.Phrases`.""" + """Minimal state & functionality exported from :class:`~gensim.models.phrases.Phrases`. + + The goal of this class is to cut down memory consumption of `Phrases`, by discarding model state + not strictly needed for the bigram detection task. + + Use this instead of `Phrases` if you do not need to update the bigram statistics with new documents any more. + + """ def __init__(self, phrases_model): """ @@ -777,7 +790,7 @@ def pseudocorpus(self, phrases_model): return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, phrases_model.common_terms) def score_item(self, worda, wordb, components, scorer): - """Score bigram. + """Score a bigram. Parameters ---------- @@ -802,17 +815,18 @@ def score_item(self, worda, wordb, components, scorer): return -1 def __getitem__(self, sentence): - """Convert the input tokens `sentence` into phrase tokens. + """Convert the input sequence of tokens `sentence` into a sequence of tokens where adjacent + tokens are replaced by a single token if they form a bigram collocation. Parameters ---------- sentence : {list of str, iterable of list of str} - Input sentence or sentences. + Input sentence or a stream of sentences. Return ------ {list of str, iterable of list of str} - Sentence or sentences with phrase tokens that joined by delimiter-character. 
+ Sentence or sentences with phrase tokens joined by `self.delimiter` character. Examples ---------- diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 68e83d8e6f..ae24556835 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -5,6 +5,13 @@ # Copyright (C) 2017 Mohit Rathore # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html +"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency +` vector space bag-of-words models. + +For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes), +see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/ + +""" import logging from functools import partial @@ -18,7 +25,7 @@ def resolve_weights(smartirs): - """Checks for validity of `smartirs` parameter. + """Check the validity of `smartirs` parameters. Parameters ---------- @@ -32,20 +39,22 @@ def resolve_weights(smartirs): Returns ------- - w_tf : str - Term frequency weighing: + 3-tuple (local_letter, global_letter, normalization_letter) + + local_letter : str + Term frequency weighing, one of: * `n` - natural, * `l` - logarithm, * `a` - augmented, * `b` - boolean, * `L` - log average. - w_df : str - Document frequency weighting: + global_letter : str + Document frequency weighting, one of: * `n` - none, * `t` - idf, * `p` - prob idf. - w_n : str - Document normalization: + normalization_letter : str + Document normalization, one of: * `n` - none, * `c` - cosine. @@ -53,7 +62,7 @@ def resolve_weights(smartirs): ------ ValueError If `smartirs` is not a string of length 3 or one of the decomposed value - doesn't fit the list of permissible values + doesn't fit the list of permissible values. 
""" if not isinstance(smartirs, str) or len(smartirs) != 3: @@ -74,12 +83,12 @@ def resolve_weights(smartirs): def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): - """Compute default inverse-document-frequency for a term with document frequency: - :math:`idf = add + log_{log\_base} \\frac{totaldocs}{doc\_freq}` + """Compute inverse-document-frequency for a term with the given document frequency `docfreq`: + :math:`idf = add + log_{log\_base} \\frac{totaldocs}{docfreq}` Parameters ---------- - docfreq : float + docfreq : {int, float} Document frequency. totaldocs : int Total number of documents. @@ -103,16 +112,17 @@ def precompute_idfs(wglobal, dfs, total_docs): Parameters ---------- wglobal : function - Custom function for calculation idf, look at "universal" :func:`~gensim.models.tfidfmodel.updated_wglobal`. + Custom function for calculating the "global" weighting function. + See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`. dfs : dict - Dictionary with term_id and how many documents this token appeared. + Dictionary mapping `term_id` into how many documents did that term appear in. total_docs : int - Total number of document. + Total number of documents. Returns ------- - dict - Precomputed idfs in format {term_id_1: idfs_1, term_id_2: idfs_2, ...} + dict of (int, float) + Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`. """ # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. @@ -120,36 +130,36 @@ def precompute_idfs(wglobal, dfs, total_docs): return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} -def updated_wlocal(tf, n_tf): - """A scheme to transform `tf` or term frequency based on the value of `n_tf`. +def smartirs_wlocal(tf, local_scheme): + """Calculate local term weight for a term using the weighting scheme specified in `local_scheme`. Parameters ---------- tf : int Term frequency. 
- n_tf : {'n', 'l', 'a', 'b', 'L'} - Parameter to decide the current transformation scheme. + local : {'n', 'l', 'a', 'b', 'L'} + Local transformation scheme. Returns ------- float - Calculated wlocal. + Calculated local weight. """ - if n_tf == "n": + if local_scheme == "n": return tf - elif n_tf == "l": + elif local_scheme == "l": return 1 + np.log2(tf) - elif n_tf == "a": + elif local_scheme == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) - elif n_tf == "b": + elif local_scheme == "b": return tf.astype('bool').astype('int') - elif n_tf == "L": + elif local_scheme == "L": return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0))) -def updated_wglobal(docfreq, totaldocs, n_df): - """A scheme to transform `docfreq` or document frequency based on the value of `n_df`. +def smartirs_wglobal(docfreq, totaldocs, global_scheme): + """Calculate global document weight based on the weighting scheme specified in `global_scheme`. Parameters ---------- @@ -157,56 +167,59 @@ def updated_wglobal(docfreq, totaldocs, n_df): Document frequency. totaldocs : int Total number of documents. - n_df : {'n', 't', 'p'} - Parameter to decide the current transformation scheme. + global_scheme : {'n', 't', 'p'} + Global transformation scheme. Returns ------- float - Calculated wglobal. + Calculated global weight. """ - if n_df == "n": + if global_scheme == "n": return 1. - elif n_df == "t": + elif global_scheme == "t": return np.log2(1.0 * totaldocs / docfreq) - elif n_df == "p": + elif global_scheme == "p": return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) -def updated_normalize(x, n_n, return_norm=False): - """Normalizes the final tf-idf value according to the value of `n_n`. +def smartirs_normalize(x, norm_scheme, return_norm=False): + """Normalize a vector using the normalization scheme specified in `norm_scheme`. Parameters ---------- x : numpy.ndarray Input array - n_n : {'n', 'c'} - Parameter that decides the normalizing function to be used. 
+ norm_scheme : {'n', 'c'} + Normalizing function to use: + `n`: no normalization + `c`: unit L2 norm (scale `x` to unit euclidean length) return_norm : bool, optional - If True - returns the length of vector `x`. + Return the length of `x` as well? Returns ------- numpy.ndarray Normalized array. - float - Vector length. + float (only if return_norm is set) + L2 norm of `x`. """ - if n_n == "n": + if norm_scheme == "n": if return_norm: - return x, 1. + _, length = matutils.unitvec(x, return_norm=return_norm) + return x, length else: return x - elif n_n == "c": + elif norm_scheme == "c": return matutils.unitvec(x, return_norm=return_norm) class TfidfModel(interfaces.TransformationABC): """Objects of this class realize the transformation between word-document co-occurrence matrix (int) - into a locally/globally weighted TF_IDF matrix (positive floats). + into a locally/globally weighted TF-IDF matrix (positive floats). Examples -------- @@ -216,16 +229,15 @@ class TfidfModel(interfaces.TransformationABC): >>> >>> dataset = api.load("text8") >>> dct = Dictionary(dataset) # fit dictionary - >>> corpus = [dct.doc2bow(line) for line in dataset] # convert dataset to BoW format + >>> corpus = [dct.doc2bow(line) for line in dataset] # convert corpus to BoW format >>> >>> model = TfidfModel(corpus) # fit model - >>> vector = model[corpus[0]] # apply model + >>> vector = model[corpus[0]] # apply model to the first corpus document """ - def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65): - """Compute tf-idf by multiplying a local component (term frequency) with a global component + """Compute TF-IDF by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing the resulting documents to unit length. 
Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents @@ -252,9 +264,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden wglobal : function, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. normalize : bool, optional - It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length - (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts - and returns a sparse vector. + Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`. smartirs : str, optional SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. @@ -280,19 +290,21 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden For more information visit `SMART Information Retrieval System `_. pivot : float, optional - It is the point around which the regular normalization curve is `tilted` to get the new pivoted + See the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. + + Pivot is the point around which the regular normalization curve is `tilted` to get the new pivoted normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: "Pivoted Document Length Normalization" `_ it is the point where the retrieval and relevance curves intersect. - This parameter along with slope is used for pivoted document length normalization. - Only when `pivot` is not None pivoted document length normalization will be applied else regular TfIdf - is used. + + This parameter along with `slope` is used for pivoted document length normalization. + Only when `pivot` is not None will pivoted document length normalization be applied. 
+ Otherwise, regular TfIdf is used. slope : float, optional - It is the parameter required by pivoted document length normalization which determines the slope to which - the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not - None. - """ + Parameter required by pivoted document length normalization which determines the slope to which + the `old normalization` can be tilted. This parameter only works when pivot is defined. + """ self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None @@ -304,13 +316,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(updated_wlocal, n_tf=n_tf) - self.wglobal = partial(updated_wglobal, n_df=n_df) + self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) + self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) # also return norm factor if pivot is not none if self.pivot is None: - self.normalize = partial(updated_normalize, n_n=n_n) + self.normalize = partial(smartirs_normalize, norm_scheme=n_n) else: - self.normalize = partial(updated_normalize, n_n=n_n, return_norm=True) + self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -334,19 +346,19 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden @classmethod def load(cls, *args, **kwargs): - """ - Load a previously saved TfidfModel class. Handles backwards compatibility from - older TfidfModel versions which did not use pivoted document normalization. + """Load a previously saved TfidfModel class. 
Handles backwards compatibility from + older TfidfModel versions which did not use pivoted document normalization. + """ model = super(TfidfModel, cls).load(*args, **kwargs) if not hasattr(model, 'pivot'): - logger.info('older version of %s loaded without pivot arg', cls.__name__) - logger.info('Setting pivot to None.') model.pivot = None + logger.info('older version of %s loaded without pivot arg', cls.__name__) + logger.info('Setting pivot to %s.', model.pivot) if not hasattr(model, 'slope'): - logger.info('older version of %s loaded without slope arg', cls.__name__) - logger.info('Setting slope to 0.65.') model.slope = 0.65 + logger.info('older version of %s loaded without slope arg', cls.__name__) + logger.info('Setting slope to %s.', model.slope) return model def __str__(self): @@ -384,19 +396,21 @@ def initialize(self, corpus): self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): - """Get tf-idf representation of the input vector and/or corpus. + """Get the tf-idf representation of an input vector and/or corpus. bow : {list of (int, int), iterable of iterable of (int, int)} - Input document or copus in BoW format. + Input document in the `sparse Gensim bag-of-words format + `_, + or a streamed corpus of such documents. eps : float Threshold value, will remove all position that have tfidf-value less than `eps`. Returns ------- vector : list of (int, float) - TfIdf vector, if `bow` is document **OR** + TfIdf vector, if `bow` is a single document :class:`~gensim.interfaces.TransformedCorpus` - TfIdf corpus, if `bow` is corpus. + TfIdf corpus, if `bow` is a corpus. 
""" self.eps = eps diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 5dedc50b67..4e036d720e 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -4,9 +4,10 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Computing similarities across a collection of documents in the Vector Space Model. +"""Compute similarities across a collection of documents in the Vector Space Model. The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents. + Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each document in the index?". The result is a vector of numbers as large as the size of the initial set of documents, that is, one float for each index document. Alternatively, you can also request only the top-N most @@ -16,12 +17,13 @@ How It Works ------------ The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"), -which are disk-based. If your entire index fits in memory (~hundreds of thousands documents for 1GB of RAM), +which are disk-based. If your entire index fits in memory (~one million documents per 1GB of RAM), you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. -These are more simple but do not scale as well (they keep the entire index in RAM, no sharding). +These are more simple but do not scale as well: they keep the entire index in RAM, no sharding. They also do not +support adding new document to the index dynamically. 
-Once the index has been initialized, you can query for document similarity simply by: +Once the index has been initialized, you can query for document similarity simply by >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile >>> @@ -31,7 +33,7 @@ >>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index >>> similarities = index[query] # get similarities between the query and all index documents -If you have more query documents, you can submit them all at once, in a batch: +If you have more query documents, you can submit them all at once, in a batch >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile >>> @@ -42,12 +44,12 @@ >>> for similarities in index[batch_of_documents]: # the batch is simply an iterable of documents, aka gensim corpus. ... pass -The benefit of this batch (aka "chunked") querying is much better performance. +The benefit of this batch (aka "chunked") querying is a much better performance. To see the speed-up on your machine, run ``python -m gensim.test.simspeed`` (compare to my results `here `_). There is also a special syntax for when you need similarity of documents in the index -to the index itself (i.e. queries=indexed documents themselves). This special syntax +to the index itself (i.e. queries = the indexed documents themselves). This special syntax uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**: >>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile @@ -59,6 +61,7 @@ ... pass """ + import logging import itertools import os @@ -89,9 +92,9 @@ class Shard(utils.SaveLoad): :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query). 
""" - def __init__(self, fname, index): """ + Parameters ---------- fname : str @@ -212,10 +215,8 @@ def query_shard(args): Returns ------- - :class:`numpy.ndarray` - Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** - :class:`scipy.sparse.csr_matrix` - for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` + Similarities of the query against documents indexed in this shard. """ query, shard = args # simulate starmap (not part of multiprocessing in older Pythons) @@ -226,15 +227,15 @@ def query_shard(args): class Similarity(interfaces.SimilarityABC): - """Compute cosine similarity of a dynamic query against a static corpus of documents ('the index'). + """Compute cosine similarity of a dynamic query against a corpus of documents ('the index'). + + The index supports adding new documents dynamically. Notes ----- Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory The shards themselves are simply stored as files to disk and mmap'ed back as needed. - - Examples -------- >>> from gensim.corpora.textcorpus import TextCorpus @@ -276,31 +277,34 @@ def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize Parameters ---------- output_prefix : str - Prefix for shard filename. If None - random filename in temp will be used. + Prefix for shard filename. If None, a random filename in temp will be used. corpus : iterable of list of (int, number) - Corpus in BoW format. + Corpus in streamed Gensim bag-of-words format. num_features : int Size of the dictionary (number of features). num_best : int, optional If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. Otherwise, return a full vector with one float for every document in the index. chunksize : int, optional - Size of block. + Size of query chunks. 
Used internally when the query is an entire corpus. shardsize : int, optional - Size of shards should be chosen so that a `shardsize x chunksize` matrix of floats fits comfortably - into memory. + Maximum shard size, in documents. Choose a value so that a `shardsize x chunksize` matrix of floats fits + comfortably into your RAM. norm : {'l1', 'l2'}, optional Normalization to use. Notes - ------------ - Documents are split (internally, transparently) into shards of `shardsize` documents each, converted to matrix, - for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`. + ----- + Documents are split (internally, transparently) into shards of `shardsize` documents each, and each shard + converted to a matrix, for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`. + If you don't specify an output prefix, a random filename in temp will be used. - If your entire index fits in memory (~hundreds of thousands - documents for 1GB of RAM), you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` - or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. These are more simple - but do not scale as well (they keep the entire index in RAM, no sharding). + + If your entire index fits in memory (~1 million documents per 1GB of RAM), you can also use the + :class:`~gensim.similarities.docsim.MatrixSimilarity` or + :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly. + These are more simple but do not scale as well (they keep the entire index in RAM, no sharding). + They also do not support adding new document dynamically. 
""" if output_prefix is None: @@ -427,7 +431,7 @@ def close_shard(self): self.fresh_docs, self.fresh_nnz = [], 0 def reopen_shard(self): - """Reopen incomplete shard.""" + """Reopen an incomplete shard.""" assert self.shards if self.fresh_docs: raise ValueError("cannot reopen a shard with fresh documents in index") @@ -441,18 +445,17 @@ def reopen_shard(self): logger.debug("reopen complete") def query_shards(self, query): - """Applying shard[query] for each shard in `self.shards`, as a sequence. + """Apply shard[query] to each shard in `self.shards`. Used internally. Parameters ---------- query : {iterable of list of (int, number) , list of (int, number))} Document in BoW format or corpus of documents. - Returns ------- - (None, list of ...) - Result of search. + (None, list of individual shard query results) + Query results. """ args = zip([query] * len(self.shards), self.shards) @@ -467,19 +470,17 @@ def query_shards(self, query): return pool, result def __getitem__(self, query): - """Get similarities of document (or corpus) `query` to all documents in the corpus. + """Get similarities of the document (or corpus) `query` to all documents in the corpus. Parameters ---------- query : {iterable of list of (int, number) , list of (int, number))} - Corpus or document of corpus. + A single document in bag-of-words format, or a corpus (iterable) of such documents. Return ------ - :class:`numpy.ndarray` - Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** - :class:`scipy.sparse.csr_matrix` - for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` + Similarities of the query against this index. 
Notes ----- @@ -496,7 +497,7 @@ def __getitem__(self, query): >>> >>> corpus = TextCorpus(datapath('testcorpus.txt')) >>> index = Similarity('temp', corpus, num_features=400) - >>> result = index[corpus] # similarities matrix + >>> result = index[corpus] # pairwise similarities of each document against each document """ self.close_shard() # no-op if no documents added to index since last query @@ -547,7 +548,7 @@ def convert(shard_no, doc): return result def vector_by_id(self, docpos): - """Get indexed vector corresponding to the document at position `docpos`. + """Get the indexed vector corresponding to the document at position `docpos`. Parameters ---------- @@ -557,7 +558,7 @@ def vector_by_id(self, docpos): Return ------ :class:`scipy.sparse.csr_matrix` - Indexed vector, internal type depends on underlying index. + Indexed vector. Examples -------- @@ -584,19 +585,17 @@ def vector_by_id(self, docpos): return result def similarity_by_id(self, docpos): - """Get similarity of the given document only by `docpos`. + """Get similarity of a document specified by its index position `docpos`. Parameters ---------- docpos : int - Document position in index + Document position in the index. Return ------ - :class:`numpy.ndarray` - Similarities of document/corpus if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** - :class:`scipy.sparse.csr_matrix` - for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` + Similarities of the given document against this index. Examples -------- @@ -617,14 +616,12 @@ def similarity_by_id(self, docpos): def __iter__(self): """For each index document in index, compute cosine similarity against all other documents in the index. - Using :meth:`~gensim.similarities.docsim.Similarity.iter_chunks`. + Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks` internally. 
Yields ------ - :class:`numpy.ndarray` - Similarities of document if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** - :class:`scipy.sparse.csr_matrix` - for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` + Similarities of each document in turn against the index. """ # turn off query normalization (vectors in the index are already normalized, save some CPU) @@ -640,24 +637,18 @@ def __iter__(self): self.norm = norm # restore normalization def iter_chunks(self, chunksize=None): - """Iteratively yield the index as chunks of documents, each of size <= chunksize. + """Iteratively yield the index as chunks of document vectors, each of size <= chunksize. Parameters ---------- chunksize : int, optional Size of chunk,, if None - `self.chunksize` will be used. - Notes - ----- - The chunk is returned in its raw form. - The size of the chunk may be smaller than requested; it is up to the caller to check the result for real length. - Yields ------ - :class:`numpy.ndarray` - Similarities of document if index is :class:`~gensim.similarities.docsim.MatrixSimilarity` **or** - :class:`scipy.sparse.csr_matrix` - for case if index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`. + :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix` + Chunks of the index as 2D arrays. The arrays are either dense or sparse, depending on + whether the shard was storing dense or sparse vectors. 
""" self.close_shard() @@ -677,26 +668,27 @@ def iter_chunks(self, chunksize=None): yield chunk def check_moved(self): - """Update shard locations (for case if the server directory has moved on filesystem).""" + """Update shard locations, for case where the server prefix location changed on the filesystem.""" dirname = os.path.dirname(self.output_prefix) for shard in self.shards: shard.dirname = dirname def save(self, fname=None, *args, **kwargs): - """Save the object via pickling (also see load) under filename specified in the constructor. + """Save the index object via pickling under `fname`. See also :meth:`~gensim.docsim.Similarity.load()`. Parameters ---------- fname : str, optional Path for save index, if not provided - will be saved to `self.output_prefix`. *args : object - Arguments, look at :meth:`gensim.interfaces.SimilarityABC.save`. + Arguments, see :meth:`gensim.utils.SaveLoad.save`. **kwargs : object - Keyword arguments, look at :meth:`gensim.interfaces.SimilarityABC.save`. + Keyword arguments, see :meth:`gensim.utils.SaveLoad.save`. Notes ----- - Call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill unfinished shards to disk first. + Will call :meth:`~gensim.similarities.Similarity.close_shard` internally to spill + any unfinished shards to disk first. 
Examples -------- @@ -708,7 +700,7 @@ def save(self, fname=None, *args, **kwargs): >>> output_fname = get_tmpfile("saved_index") >>> >>> corpus = TextCorpus(datapath('testcorpus.txt')) - >>> index = Similarity(temp_fname, corpus, num_features=400) + >>> index = Similarity(output_fname, corpus, num_features=400) >>> >>> index.save(output_fname) >>> loaded_index = index.load(output_fname) @@ -720,7 +712,7 @@ def save(self, fname=None, *args, **kwargs): super(Similarity, self).save(fname, *args, **kwargs) def destroy(self): - """Delete all files under self.output_prefix, object is not usable after calling this method anymore.""" + """Delete all files under self.output_prefix Index is not usable anymore after calling this method.""" import glob for fname in glob.glob(self.output_prefix + '*'): logger.info("deleting %s", fname) @@ -748,18 +740,18 @@ def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None Parameters ---------- corpus : iterable of list of (int, number) - Corpus in BoW format. + Corpus in streamed Gensim bag-of-words format. num_best : int, optional If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. Otherwise, return a full vector with one float for every document in the index. - dtype : numpy.dtype - Datatype of internal matrix - num_features : int, optional - Size of the dictionary. - chunksize : int, optional - Size of chunk. + num_features : int + Size of the dictionary (number of features). corpus_len : int, optional - Size of `corpus`, if not specified - will scan corpus to determine size. + Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size. + chunksize : int, optional + Size of query chunks. Used internally when the query is an entire corpus. + dtype : numpy.dtype, optional + Datatype to store the internal matrix in. 
""" if num_features is None: @@ -804,7 +796,7 @@ def __len__(self): return self.index.shape[0] def get_similarities(self, query): - """Get similarity between `query` and current index instance. + """Get similarity between `query` and this index. Warnings -------- @@ -914,11 +906,11 @@ def __len__(self): return len(self.corpus) def get_similarities(self, query): - """Get similarity between `query` and current index instance. + """Get similarity between `query` and this index. Warnings -------- - Do not use this function directly; use the self[query] syntax instead. + Do not use this function directly; use the `self[query]` syntax instead. Parameters ---------- @@ -964,16 +956,18 @@ def __str__(self): class WmdSimilarity(interfaces.SimilarityABC): """Compute negative WMD similarity against a corpus of documents by storing the index matrix in memory. - See :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` for more information. Also, tutorial `notebook `_ for more examples. When using this code, please consider citing the following papers: - `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching" - `_, `Ofir Pele and Michael Werman, "Fast and robust earth - mover's distances" `_, `"Matt Kusner et al. "From Word - Embeddings To Document Distances" `_. + + * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching" + `_ + * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances" + `_ + * `Matt Kusner et al. "From Word Embeddings To Document Distances" + `_ Example ------- @@ -992,7 +986,6 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> sims = index[query] """ - def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256): """ @@ -1030,11 +1023,11 @@ def __len__(self): return len(self.corpus) def get_similarities(self, query): - """Get similarity between `query` and current index instance. 
+ """Get similarity between `query` and this index. Warnings -------- - Do not use this function directly; use the self[query] syntax instead. + Do not use this function directly; use the `self[query]` syntax instead. Parameters ---------- @@ -1082,7 +1075,7 @@ class SparseMatrixSimilarity(interfaces.SimilarityABC): Notes ----- - Use this if your input corpus contains sparse vectors (such as documents in bag-of-words format) and fits into RAM. + Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM. The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead. @@ -1099,35 +1092,33 @@ class SparseMatrixSimilarity(interfaces.SimilarityABC): Index similarity (dense with cosine distance). """ - def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None, num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False): """ + Parameters ---------- corpus: iterable of list of (int, float) A list of documents in the BoW format. num_features : int, optional - Size of the dictionary. + Size of the dictionary. Must be either specified, or present in `corpus.num_terms`. num_terms : int, optional - Number of terms, **must be specified**. + Alias for `num_features`, you can use either. num_docs : int, optional - Number of documents in `corpus`. + Number of documents in `corpus`. Will be calculated if not provided. num_nnz : int, optional - Number of non-zero terms. + Number of non-zero elements in `corpus`. Will be calculated if not provided. num_best : int, optional If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0. Otherwise, return a full vector with one float for every document in the index. chunksize : int, optional - Size of chunk. + Size of query chunks. Used internally when the query is an entire corpus. 
dtype : numpy.dtype, optional - Data type of internal matrix. + Data type of the internal matrix. maintain_sparsity : bool, optional - if True - will return sparse arr from - :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`. + Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`? """ - self.num_best = num_best self.normalize = True self.chunksize = chunksize @@ -1168,11 +1159,11 @@ def __len__(self): return self.index.shape[0] def get_similarities(self, query): - """Get similarity between `query` and current index instance. + """Get similarity between `query` and this index. Warnings -------- - Do not use this function directly; use the self[query] syntax instead. + Do not use this function directly; use the `self[query]` syntax instead. Parameters ---------- diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 7579a09cc9..8c32ab4dd0 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -35,9 +35,10 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`. - For more information, please have a look to `Mikolov, et. al: "Efficient Estimation of Word Representations in - Vector Space" `_ and `Gerlof Bouma: "Normalized (Pointwise) Mutual Information - in Collocation Extraction" `_. + For more information, please have a look to `Mikolov, et. al: "Distributed Representations + of Words and Phrases and their Compositionality" `_ and + `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction" + `_. """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, @@ -63,8 +64,8 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, or with a function with the expected parameter names. Two built-in scoring functions are available by setting `scoring` to a string: - * 'default': Explained in `Mikolov, et. 
al: "Efficient Estimation of Word Representations - in Vector Space" `_. + * 'default': `Mikolov, et. al: "Distributed Representations of Words and Phrases + and their Compositionality" `_. * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction" `_. diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst index 8d3c60c73ed35f9d341a1a9e65fd490301539739..b6448f0e388e9f0e1410d6c4b718631b65b7f9fe 100644 GIT binary patch literal 909 zcma))%Wl&^6o%tCiJeJXTA*CZ{pJ8Q&=m_%*;QhM7ZtN>WI5wWY{|Fdu@j|IB)R|| z0kLAibMOZ67OZ#!R&YEe$S6Xpt>ZKP=J!7%&;O>$$0CtoY{#iDB54<+&>yKWd0h=( zRhWR*KZ52Qbc=P9G>^R~JQUF4qcY(|nnsdjUS5PAD%<}zlyN1>rTf={Kl-s&H!0(6CZRWAE!?Gql{u_hw80g( zd72A5E+gDho_jMQxfgjkfwkj#;uo__5IDt}qo=e%wfFn5Zc&|S&m6~j`3o=gpbcE! z48DB&tQ}}C-tEH%)q=HP{U3IkHka5LsxPs#w6(-GX}ivT?&2EfXs6E9y634;XG(X0 znsugh7iqW7lsT$UHjX;j4eTcNE_Q;| zqs@{L*A2%Q0)f`I2FwWkooSl;aNA+#B0X@2=}E!iE^BAuBrV{c!;I-TN{3zqR&bL1 z{ZSL>WBTq}K_AoifUO>TEb{k|X@$yPXBwrLAJ{&bw-OVu(fW}U*l7Kjtthx~e!{dw QIb)6`533yS49cPX8!<~ZasU7T literal 1261 zcma)++fvg&7{`00ZHfm}JRlxXDJoU)gbIR3l$EM0YE)`WleCGXN&ndt24RWi_OX%*>ah!27!)7+S+5PQz-n>m24a;Lrv)uGe%Vp)T?wEBsSLVcY zRxB$>4822g6{(V%OC%CryQz8J3eu!sP4jQF&X8F}wnP#nMUuRWC(V0?_bi`v@iE88 zJRb{uRCq6vR9nsR<1J^=57AXo75<@VPQ&v9%Vfw^kyi^spC2@J*IBVpFzRj32z}pW zN=pwyN9V>V@jkR$rXE_R_E&;};u^Xswv_DR&?>e{`#sIWHZ{pL8Q9J$A$n@kPsbcx z@&gk)DiX6=3_GVM@ZsaT;a@#!ue3I>jCR1BMN4D)qUHKJH+)wWy=q==^H8;x8Fp8( zN3Mv_nZCiWmlULqrH1R*bQk;VO}Np474)rPKSh7Z?&D4zP%|P(npwv|=|zY`!i&Q- zxxur@aAXxnd3p?KPubuBE;rjQk6#ez%L+4eR}T~nZ0sv$xZJW798*(VaD5ENd190A zae`v7MAA~nI7*dDW#%_;@z~#?C+Cw>IHh*+ z6;}@(Zv>|!3{#XNoS`^NlJgupyGIY14z0~&}wqyhBQFhLAJl8P#*O$h|hf?-G^7{Z82 zcemlS@CS8rgM!BnOOSytq%g@1U4knK#s!x;_6H%;r(<(UoY&D++iu{MEIkx3w{i7+ z;2eMqD&Fm8K}+Hw@dICjT4x&0=WQ*c2a(+a(Su18(aMgFbgp(PN!Sfl?%y^i900^M zU`DXhJJ)L+so2Z`EE1A`-w2@^g7hwttAW6hBQNnOh zqT`LNN>Y^Fb~1pBz)h)2Xdyg<@5HiP1_D4$B@~e{bf!uiYdo>uzX7NW2VrBmwKS!8 
zgbR){ietshigCzcu&#E{WzBC|6v(>#AGg02dXD$Duk^uI#5zmM^EBvXY6 I4AExr0H;F`zyJUM literal 822 zcmV-61IheCT4*^jL0KkKS?3V!e*gkE|NZ~}|NsC0|NsC0|NsC0|FUC%0HBBf03sL$ z1P}lKzyg0iguRHi~%x zA?h?~p^>#T$)GeeXaE2MLn8zLWMmp-4Ff_df0U{Enw!-F)Mx+z0004?0000001W^% zGy$LiibXU44FDPd27mwn00000001-q02vN9)ipFWKm-7iISr==DeZtoswe^=J%_3~ z0RS^3l!ka3U4JX`ly^4FHH~1{4^;n1ulaz#yUm2to+vQ`CO>VQBO|C{`Rn zq9aWt13)4lAQ*^k7SSvuAQ2(WHCu9SoxE++w~_6_5)ed%=@kG3Twol|0PI~Tj-*5$ zlMmPoWXJ&nODMo6Fv5Zi(NhK|2g$uk8Y|o;qx2&$j#+4HolcJ6e|@L> zEci%H0Fs%3jsPh@EGcs@R|P;?sw=X(QioL0LLqT)Zt(DlvtElFONUOqFB#Q<%9R{; zc~@lSWmc(IF<(m(H7mx2T$n@_LGDDT+fw~sVY+CNk)Z_`4blQb&p_qoS&#=16rmCi zF-pIG%^>Zq#F%MFkdunWtVw-f@;v07l7RP>5jon!nCQG=of}JuSFP+^e>m2|?WGZYN zcQ9L36s5KTHcYg29Jm>r(Ba-|HnR|(nMqn>o>M$}tU5*-Bg5ix#);VdCLVr%lTXo7 zsio?>SF-nC3*~((H#1* # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains various general utility functions.""" +"""Various general utility functions.""" from __future__ import with_statement from contextlib import contextmanager @@ -73,11 +73,8 @@ def get_random_state(seed): Notes ----- - Method originally from [1]_ and written by @joshloyal. - - References - ---------- - .. [1] https://github.com/maciejkula/glove-python + Method originally from `maciejkula/glove-python `_ + and written by `@joshloyal `_. """ if seed is None or seed is np.random: @@ -94,11 +91,7 @@ def synchronous(tlockname): Notes ----- - Adapted from [2]_ - - References - ---------- - .. [2] http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/ + Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/. """ def _synched(func): @@ -117,7 +110,7 @@ def _synchronizer(self, *args, **kwargs): def file_or_filename(input): - """Open file with `smart_open`. + """Open a filename for reading with `smart_open`, or seek to the beginning if `input` is an already open file. 
Parameters ---------- @@ -126,8 +119,8 @@ def file_or_filename(input): Returns ------- - input : file-like object - Opened file OR seek out to 0 byte if `input` is already file-like object. + file-like object + An open file, positioned at the beginning. """ if isinstance(input, string_types): @@ -141,7 +134,7 @@ def file_or_filename(input): @contextmanager def open_file(input): - """Provide "with-like" behaviour except closing the file object. + """Provide "with-like" behaviour without closing the file object. Parameters ---------- @@ -170,7 +163,7 @@ def open_file(input): def deaccent(text): - """Remove accentuation from the given string. + """Remove letter accents from the given string. Parameters ---------- @@ -180,7 +173,7 @@ def deaccent(text): Returns ------- str - Unicode string without accentuation. + Unicode string without accents. Examples -------- @@ -221,25 +214,24 @@ def copytree_hardlink(source, dest): def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False): - """Iteratively yield tokens as unicode strings, removing accent marks and optionally lowercasing string - if any from `lowercase`, `to_lower`, `lower` set to True. + """Iteratively yield tokens as unicode strings, optionally removing accent marks and lowercasing it. Parameters ---------- - text : str + text : str or bytes Input string. - lowercase : bool, optional - If True - lowercase input string. deacc : bool, optional - If True - remove accentuation from string by :func:`~gensim.utils.deaccent`. + Remove accentuation using :func:`~gensim.utils.deaccent`? encoding : str, optional Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`. errors : str, optional Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`. + lowercase : bool, optional + Lowercase the input string? to_lower : bool, optional - Same as `lowercase`. + Same as `lowercase`. Convenience alias. 
lower : bool, optional - Same as `lowercase`. + Same as `lowercase`. Convenience alias. Yields ------ @@ -281,19 +273,20 @@ def simple_tokenize(text): def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): - """Convert a document into a list of tokens (also with lowercase and optional de-accents), - used :func:`~gensim.utils.tokenize`. + """Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long. + + Uses :func:`~gensim.utils.tokenize` internally. Parameters ---------- doc : str Input document. deacc : bool, optional - If True - remove accentuation from string by :func:`~gensim.utils.deaccent`. + Remove accent marks from tokens using :func:`~gensim.utils.deaccent`? min_len : int, optional - Minimal length of token in result (inclusive). + Minimum length of token (inclusive). Shorter tokens are discarded. max_len : int, optional - Maximal length of token in result (inclusive). + Maximum length of token in result (inclusive). Longer tokens are discarded. Returns ------- @@ -309,16 +302,16 @@ def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): def any2utf8(text, errors='strict', encoding='utf8'): - """Convert `text` to bytestring in utf8. + """Convert a unicode or bytes string in the given encoding into a utf8 bytestring. Parameters ---------- text : str Input text. errors : str, optional - Error handling behaviour, used as parameter for `unicode` function (python2 only). + Error handling behaviour if `text` is a bytestring. encoding : str, optional - Encoding of `text` for `unicode` function (python2 only). + Encoding of `text` if it is a bytestring. Returns ------- @@ -337,16 +330,16 @@ def any2utf8(text, errors='strict', encoding='utf8'): def any2unicode(text, encoding='utf8', errors='strict'): - """Convert `text` to unicode. + """Convert `text` (bytestring in given encoding or unicode) to unicode. Parameters ---------- text : str Input text. 
errors : str, optional - Error handling behaviour, used as parameter for `unicode` function (python2 only). + Error handling behaviour if `text` is a bytestring. encoding : str, optional - Encoding of `text` for `unicode` function (python2 only). + Encoding of `text` if it is a bytestring. Returns ------- @@ -363,7 +356,7 @@ def any2unicode(text, encoding='utf8', errors='strict'): def call_on_class_only(*args, **kwargs): - """Helper for raise `AttributeError` if method should be called from instance. + """Helper to raise `AttributeError` if a class method is called on an instance. Used internally. Parameters ---------- @@ -375,24 +368,24 @@ def call_on_class_only(*args, **kwargs): Raises ------ AttributeError - If `load` method are called on instance. + If a class method is called on an instance. """ raise AttributeError('This method should be called on a class object.') class SaveLoad(object): - """Class which inherit from this class have save/load functions, which un/pickle them to disk. + """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods. Warnings -------- - This uses pickle for de/serializing, so objects must not contain unpicklable attributes, + This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes such as lambda functions etc. """ @classmethod def load(cls, fname, mmap=None): - """Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file. + """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file. Parameters ---------- @@ -406,6 +399,7 @@ def load(cls, fname, mmap=None): See Also -------- :meth:`~gensim.utils.SaveLoad.save` + Save object to file. Returns ------- @@ -414,8 +408,8 @@ def load(cls, fname, mmap=None): Raises ------ - IOError - When methods are called on instance (should be called from class). + AttributeError + When called on an object instance instead of class (this is a class method). 
""" logger.info("loading %s object from %s", cls.__name__, fname) @@ -428,20 +422,20 @@ def load(cls, fname, mmap=None): return obj def _load_specials(self, fname, mmap, compress, subname): - """Loads any attributes that were stored specially, and gives the same opportunity - to recursively included :class:`~gensim.utils.SaveLoad` instances. + """Load attributes that were stored separately, and give them the same opportunity + to recursively load using the :class:`~gensim.utils.SaveLoad` interface. Parameters ---------- fname : str - Path to file that contains needed object. - mmap : str - Memory-map option. + Input file path. + mmap : {None, ‘r+’, ‘r’, ‘w+’, ‘c’} + Memory-map options. See `numpy.load(mmap_mode) + `_. compress : bool - Set to True if file is compressed. + Is the input file compressed? subname : str - ... - + Attribute name. Set automatically during recursive processing. """ def mmap_error(obj, filename): @@ -492,7 +486,7 @@ def mmap_error(obj, filename): @staticmethod def _adapt_by_suffix(fname): - """Give appropriate compress setting and filename formula. + """Get compress setting and filename for numpy file compression. Parameters ---------- @@ -509,7 +503,7 @@ def _adapt_by_suffix(fname): return compress, lambda *args: '.'.join(args + (suffix,)) def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. + """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`. Parameters ---------- @@ -526,18 +520,12 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro Notes ----- - If `separately` is None, automatically detect large - numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and - allows mmap'ing large arrays back on load efficiently. 
- - You can also set `separately` manually, in which case it must be - a list of attribute names to be stored in separate files. The - automatic check is not performed in this case. + If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored, + and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back + on load efficiently. - See Also - -------- - :meth:`~gensim.utils.SaveLoad.load` + You can also set `separately` manually, in which case it must be a list of attribute names to be stored + in separate files. The automatic check is not performed in this case. """ logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) @@ -564,11 +552,11 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, fname : str Output filename. separately : list or None - Iterable of attributes than need to store distinctly + List of attributes to store separately. sep_limit : int - Limit for separation. + Don't store arrays smaller than this separately. In bytes. ignore : iterable of str - Attributes that shouldn't be store. + Attributes that shouldn't be stored at all. pickle_protocol : int Protocol number for pickle. compress : bool @@ -659,7 +647,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, return restores + [(self, asides)] def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. + """Save the object to a file. Parameters ---------- @@ -667,21 +655,24 @@ def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore= Path to output file or already opened file-like object. If the object is a file handle, no special array handling will be performed, all attributes will be saved to the same file. 
separately : list of str or None, optional - If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays - back on load efficiently. - If list of str - this attributes will be stored in separate files, the automatic check + If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store + them into separate files. This prevent memory errors for large objects, and also allows + `memory-mapping `_ the large arrays for efficient + loading and sharing the large arrays in RAM between multiple processes. + + If list of str: store these attributes into separate files. The automated size check is not performed in this case. - sep_limit : int - Limit for automatic separation. - ignore : frozenset of str - Attributes that shouldn't be serialize/store. - pickle_protocol : int + sep_limit : int, optional + Don't store arrays smaller than this separately. In bytes. + ignore : frozenset of str, optional + Attributes that shouldn't be stored at all. + pickle_protocol : int, optional Protocol number for pickle. See Also -------- :meth:`~gensim.utils.SaveLoad.load` + Load object from file. """ try: @@ -713,7 +704,7 @@ def get_max_id(corpus): Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Collection of texts in BoW format. Returns @@ -739,7 +730,6 @@ class FakeDict(object): This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory. """ - def __init__(self, num_terms): """ @@ -781,8 +771,8 @@ def keys(self): list of int Highest id, packed in list. - Warnings - -------- + Notes + ----- To avoid materializing the whole `range(0, self.num_terms)`, this returns the highest id = `[self.num_terms - 1]` only. 
@@ -804,7 +794,7 @@ def dict_from_corpus(corpus): Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Collection of texts in BoW format. Returns @@ -825,17 +815,25 @@ def dict_from_corpus(corpus): def is_corpus(obj): - """Check whether `obj` is a corpus. + """Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators. + The peeked element is put back into a object returned by this function, so always use + that returned object instead of the original `obj`. Parameters ---------- obj : object - Something `iterable of iterable` that contains (int, int). + An `iterable of iterable` that contains (int, numeric). - Return - ------ + Returns + ------- (bool, object) - Pair of (is_corpus, `obj`), is_corpus True if `obj` is corpus. + Pair of (is `obj` a corpus, `obj` with peeked element restored) + + Examples + -------- + >>> from gensim.utils import is_corpus + >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]] + >>> corpus_or_not, corpus = is_corpus(corpus) Warnings -------- @@ -918,13 +916,12 @@ class RepeatCorpus(SaveLoad): [[(1, 2)], [], [(1, 2)], [], [(1, 2)]] """ - def __init__(self, corpus, reps): """ Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Input corpus. reps : int Number of repeats for documents from corpus. @@ -949,13 +946,12 @@ class RepeatCorpusNTimes(SaveLoad): [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []] """ - def __init__(self, corpus, n): """ Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Input corpus. n : int Number of repeats for corpus. 
@@ -971,17 +967,16 @@ def __iter__(self): class ClippedCorpus(SaveLoad): - """Wrap a `corpus` and return `max_doc` element from it""" - + """Wrap a `corpus` and return `max_doc` element from it.""" def __init__(self, corpus, max_docs=None): """ Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Input corpus. max_docs : int - Maximal number of documents in result corpus. + Maximum number of documents in the wrapped corpus. Warnings -------- @@ -1000,17 +995,16 @@ def __len__(self): class SlicedCorpus(SaveLoad): - """Wrap `corpus` and return the slice of it""" - + """Wrap `corpus` and return a slice of it.""" def __init__(self, corpus, slice_): """ Parameters ---------- - corpus : iterable of iterable of (int, int) + corpus : iterable of iterable of (int, numeric) Input corpus. slice_ : slice or iterable - Slice for `corpus` + Slice for `corpus`. Notes ----- @@ -1046,7 +1040,8 @@ def __len__(self): def safe_unichr(intval): - """ + """Create a unicode character from its integer value. In case `unichr` fails, render the character + as an escaped `\\U<8-byte hex value of intval>` string. Parameters ---------- @@ -1069,13 +1064,14 @@ def safe_unichr(intval): def decode_htmlentities(text): - """Decode HTML entities in text, coded as hex, decimal or named. - This function from [3]_. + """Decode all HTML entities in text that are encoded as hex, decimal or named entities. + Adapted from `python-twitter-ircbot/html_decode.py + `_. Parameters ---------- text : str - Input html text. + Input HTML. Examples -------- @@ -1089,10 +1085,6 @@ def decode_htmlentities(text): >>> print(decode_htmlentities("foo < bar")) foo < bar - References - ---------- - .. 
[3] http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py - """ def substitute_entity(match): try: @@ -1120,22 +1112,23 @@ def substitute_entity(match): def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32): - """Give elements from the iterable in `chunksize`-ed lists. - The last returned element may be smaller (if length of collection is not divisible by `chunksize`). + """Yield elements from `iterable` in "chunksize"-ed groups. + + The last returned element may be smaller if the length of collection is not divisible by `chunksize`. Parameters ---------- iterable : iterable of object - Any iterable. + An iterable. chunksize : int - Size of chunk from result. + Split iterable into chunks of this size. as_numpy : bool, optional - If True - yield `np.ndarray`, otherwise - list + Yield chunks as `np.ndarray` instead of lists. Yields ------ - list of object OR np.ndarray - Groups based on `iterable` + list OR np.ndarray + "chunksize"-ed chunks of elements from `iterable`. Examples -------- @@ -1161,7 +1154,26 @@ def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32): class InputQueue(multiprocessing.Process): + """Populate a queue of input chunks from a streamed corpus. + + Useful for reading and chunking corpora in the background, in a separate process, + so that workers that use the queue are not starved for input chunks. + + """ def __init__(self, q, corpus, chunksize, maxsize, as_numpy): + """ + Parameters + ---------- + q : multiprocessing.Queue + Enqueue chunks into this queue. + corpus : iterable of iterable of (int, numeric) + Corpus to read and split into "chunksize"-ed groups + chunksize : int + Split `corpus` into chunks of this size. + as_numpy : bool, optional + Enqueue chunks as `numpy.ndarray` instead of lists. 
+ + """ super(InputQueue, self).__init__() self.q = q self.maxsize = maxsize @@ -1197,50 +1209,56 @@ def run(self): warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): - """Split `corpus` into smaller chunks, used :func:`~gensim.utils.chunkize_serial`. + """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`. Parameters ---------- corpus : iterable of object - Any iterable object. + An iterable. chunksize : int - Size of chunk from result. + Split `corpus` into chunks of this size. maxsize : int, optional - THIS PARAMETER IGNORED. + Ignored. For interface compatibility only. as_numpy : bool, optional - If True - yield `np.ndarray`, otherwise - list + Yield chunks as `np.ndarray`s instead of lists? Yields ------ - list of object OR np.ndarray - Groups based on `iterable` + list OR np.ndarray + "chunksize"-ed chunks of elements from `corpus`. """ for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy): yield chunk else: def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): - """Split `corpus` into smaller chunks, used :func:`~gensim.utils.chunkize_serial`. + """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`. Parameters ---------- corpus : iterable of object - Any iterable object. + An iterable. chunksize : int - Size of chunk from result. + Split `corpus` into chunks of this size. maxsize : int, optional - THIS PARAMETER IGNORED. + If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`. as_numpy : bool, optional - If True - yield `np.ndarray`, otherwise - list + Yield chunks as `np.ndarray` instead of lists? + + Yields + ------ + list OR np.ndarray + "chunksize"-ed chunks of elements from `corpus`. Notes ----- Each chunk is of length `chunksize`, except the last one which may be smaller. 
A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools. - If `maxsize > 1`, don't wait idly in between successive chunk `yields`, but rather keep filling a short queue + If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short queue (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a separate process, - and is meant to reduce I/O delays, which can be significant when `corpus` comes from a slow medium (like HDD). + and is meant to reduce I/O delays, which can be significant when `corpus` comes from a slow medium + like HDD, database or network. If `maxsize == 0`, don't fool around with parallelism and simply yield the chunksize via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations). @@ -1269,19 +1287,27 @@ def chunkize(corpus, chunksize, maxsize=0, as_numpy=False): def smart_extension(fname, ext): - """Generate filename with `ext`. + """Append a file extension `ext` to `fname`, while keeping compressed extensions like `.bz2` or + `.gz` (if any) at the end. Parameters ---------- fname : str - Path to file. + Filename or full path. ext : str - File extension. + Extension to append before any compression extensions. Returns ------- str - New path to file with `ext`. + New path to file with `ext` appended. + + Examples + -------- + + >>> from gensim.utils import smart_extension + >>> smart_extension("my_file.pkl.gz", ".vectors") + 'my_file.pkl.vectors.gz' """ fname, oext = os.path.splitext(fname) @@ -1296,7 +1322,7 @@ def smart_extension(fname, ext): def pickle(obj, fname, protocol=2): - """Pickle object `obj` to file `fname`. + """Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc. Parameters ---------- @@ -1305,7 +1331,7 @@ def pickle(obj, fname, protocol=2): fname : str Path to pickle file. 
protocol : int, optional - Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x. + Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x. """ with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows @@ -1313,7 +1339,7 @@ def pickle(obj, fname, protocol=2): def unpickle(fname): - """Load object from `fname`. + """Load object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc. Parameters ---------- @@ -1363,7 +1389,10 @@ def revdict(d): def deprecated(reason): - """Decorator which can be used to mark functions as deprecated. + """Decorator to mark functions as deprecated. + + Calling a decorated function will result in a warning being emitted, using warnings.warn. + Adapted from https://stackoverflow.com/a/40301488/8001386. Parameters ---------- @@ -1375,14 +1404,6 @@ def deprecated(reason): function Decorated function - Notes - ----- - It will result in a warning being emitted when the function is used, base code from [4]_. - - References - ---------- - .. [4] https://stackoverflow.com/a/40301488/8001386 - """ if isinstance(reason, string_types): def decorator(func): @@ -1420,19 +1441,18 @@ def new_func2(*args, **kwargs): @deprecated("Function will be removed in 4.0.0") def toptexts(query, texts, index, n=10): - """ - Debug fnc to help inspect the top `n` most similar documents (according to a - similarity index `index`), to see if they are actually related to the query. + """Debug fnc to help inspect the top `n` most similar documents (according to a similarity index `index`), + to see if they are actually related to the query. Parameters ---------- - query : list + query : {list of (int, number), numpy.ndarray} vector OR BoW (list of tuples) texts : str object that can return something insightful for each document via `texts[docid]`, such as its fulltext or snippet. 
index : any - a class from gensim.similarity.docsim + A instance from from :mod:`gensim.similarity.docsim`. Return ------ @@ -1447,7 +1467,7 @@ def toptexts(query, texts, index, n=10): def randfname(prefix='gensim'): - """Generate path with random filename/ + """Generate a random filename in temp. Parameters ---------- @@ -1457,7 +1477,7 @@ def randfname(prefix='gensim'): Returns ------- str - Full path with random filename (in temporary folder). + Full path in the in system's temporary folder, ending in a random filename. """ randpart = hex(random.randint(0, 0xffffff))[2:] @@ -1467,11 +1487,13 @@ def randfname(prefix='gensim'): @deprecated("Function will be removed in 4.0.0") def upload_chunked(server, docs, chunksize=1000, preprocess=None): """Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy). + Notes ----- Use this function to train or index large collections -- avoid sending the entire corpus over the wire as a single Pyro in-memory object. The documents will be sent in smaller chunks, of `chunksize` documents each. + """ start = 0 for chunk in grouper(docs, chunksize): @@ -1494,18 +1516,18 @@ def getNS(host=None, port=None, broadcast=True, hmac_key=None): Parameters ---------- host : str, optional - Hostname of ns. + Name server hostname. port : int, optional - Port of ns. + Name server port. broadcast : bool, optional - If True - use broadcast mechanism (i.e. all Pyro nodes in local network), not otherwise. + Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network) hmac_key : str, optional Private key. Raises ------ RuntimeError - when Pyro name server is not found + When Pyro name server is not found. Returns ------- @@ -1521,9 +1543,10 @@ def getNS(host=None, port=None, broadcast=True, hmac_key=None): def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None): - """Register object with name server (starting the name server if not running - yet) and block until the daemon is terminated. 
The object is registered under - `name`, or `name`+ some random suffix if `random_suffix` is set. + """Register an object with the Pyro name server. + + Start the name server if not running yet and block until the daemon is terminated. + The object is registered under `name`, or `name`+ some random suffix if `random_suffix` is set. """ if ns_conf is None: @@ -1543,16 +1566,12 @@ def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None def has_pattern(): - """Check that `pattern` [5]_ package already installed. + """Check whether the `pattern `_ package is installed. Returns ------- bool - True if `pattern` installed, False otherwise. - - References - ---------- - .. [5] https://github.com/clips/pattern + Is `pattern` installed? """ try: @@ -1564,8 +1583,9 @@ def has_pattern(): def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, stopwords=frozenset(), min_length=2, max_length=15): - """Use the English lemmatizer from `pattern` [5]_ to extract UTF8-encoded tokens in - their base form=lemma, e.g. "are, is, being" -> "be" etc. + """Use the English lemmatizer from `pattern `_ to extract UTF8-encoded tokens in + their base form aka lemma, e.g. "are, is, being" becomes "be" etc. + This is a smarter version of stemming, taking word context into account. Parameters @@ -1577,29 +1597,38 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded). light : bool, optional DEPRECATED FLAG, DOESN'T SUPPORT BY `pattern`. - stopwords : frozenset + stopwords : frozenset, optional Set of words that will be removed from output. - min_length : int + min_length : int, optional Minimal token length in output (inclusive). - max_length : int + max_length : int, optional Maximal token length in output (inclusive). Returns ------- list of str - List with tokens with POS tag. + List with tokens with POS tags. 
Warnings -------- - This function is only available when the optional 'pattern' package is installed. + This function is only available when the optional `pattern `_ is installed. + + Raises + ------ + ImportError + If `pattern `_ not installed. Examples -------- >>> from gensim.utils import lemmatize >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21') ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN'] + + Note the context-dependent part-of-speech tags between these two examples: + >>> lemmatize('The study ranks high.') ['study/NN', 'rank/VB', 'high/JJ'] + >>> lemmatize('The ranks study hard.') ['rank/NN', 'study/VB', 'hard/RB'] @@ -1632,16 +1661,16 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): - """Create a random gensim BoW vector. + """Create a random gensim BoW vector, with the feature counts following the Poisson distribution. Parameters ---------- dim : int, optional Dimension of vector. prob_nnz : float, optional - Probability of each coordinate will be nonzero, will be drawn from Poisson distribution. + Probability of each coordinate will be nonzero, will be drawn from the Poisson distribution. lam : float, optional - Parameter for Poisson distribution. + Lambda parameter for the Poisson distribution. Returns ------- @@ -1654,7 +1683,7 @@ def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0): def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0): - """Create a random gensim-style corpus (BoW), used :func:`~gensim.utils.mock_data_row`. + """Create a random Gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`. Parameters ---------- @@ -1681,6 +1710,7 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`. Modifies `vocab` in place, returns the sum of all counts that were pruned. 
+ Parameters ---------- vocab : dict @@ -1736,18 +1766,19 @@ def qsize(queue): def keep_vocab_item(word, count, min_count, trim_rule=None): - """Check that should we keep `word` in vocab or remove. + """Should we keep `word` in the vocab or remove it? Parameters ---------- word : str Input word. count : int - Number of times that word contains in corpus. + Number of times that word appeared in a corpus. min_count : int - Frequency threshold for `word`. + Discard words with frequency smaller than this. trim_rule : function, optional - Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`. + Custom function to decide whether to keep or discard this word. + If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`. Returns ------- @@ -1770,11 +1801,10 @@ def keep_vocab_item(word, count, min_count, trim_rule=None): def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): - r"""Run command with arguments and return its output as a byte string. - Backported from Python 2.7 as it's implemented as pure python on stdlib + small modification. - Widely used for :mod:`gensim.models.wrappers`. + r"""Run OS command with the given arguments and return its output as a byte string. - Very similar with [6]_ + Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`. + Behaves very similar to https://docs.python.org/2/library/subprocess.html#subprocess.check_output. Examples -------- @@ -1787,10 +1817,6 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): KeyboardInterrupt If Ctrl+C pressed. - References - ---------- - .. [6] https://docs.python.org/2/library/subprocess.html#subprocess.check_output - """ try: logger.debug("COMMAND: %s %s", popenargs, kwargs) @@ -1811,21 +1837,21 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs): def sample_dict(d, n=10, use_random=True): - """Pick `n` items from dictionary `d`. 
Select `n` (possibly random) items from the dictionary `d`.
+ nested_list : iterable + Possibly nested sequence of elements to flatten. Returns ------- list - Flattened version of input, where any list elements have been unpacked into the top-level list - in a recursive fashion. + Flattened version of `nested_list` where any elements that are an iterable (`collections.Iterable`) + have been unpacked into the top-level list, in a recursive fashion. """ return list(lazy_flatten(nested_list))