From 8ee1b4fb4772f51a9893660d951e2fedf4e36aa6 Mon Sep 17 00:00:00 2001 From: chengscu Date: Thu, 4 Jun 2020 11:19:33 +0800 Subject: [PATCH] Add transformers among formats CSV, Json, KV, Columns and Triple. See #93 --- .../alink/common/mapper/FlatMapper.java | 5 + .../common/mapper/FlatMapperAdapter.java | 15 +- .../dataproc/format/AnyToTripleBatchOp.java | 27 +++ .../format/BaseFormatTransBatchOp.java | 30 +++ .../dataproc/format/ColumnsToCsvBatchOp.java | 18 ++ .../dataproc/format/ColumnsToJsonBatchOp.java | 18 ++ .../dataproc/format/ColumnsToKvBatchOp.java | 18 ++ .../format/ColumnsToTripleBatchOp.java | 20 ++ .../format/ColumnsToVectorBatchOp.java | 18 ++ .../dataproc/format/CsvToColumnsBatchOp.java | 18 ++ .../dataproc/format/CsvToJsonBatchOp.java | 18 ++ .../batch/dataproc/format/CsvToKvBatchOp.java | 18 ++ .../dataproc/format/CsvToTripleBatchOp.java | 20 ++ .../dataproc/format/CsvToVectorBatchOp.java | 18 ++ .../dataproc/format/JsonToColumnsBatchOp.java | 18 ++ .../dataproc/format/JsonToCsvBatchOp.java | 18 ++ .../dataproc/format/JsonToKvBatchOp.java | 18 ++ .../dataproc/format/JsonToTripleBatchOp.java | 20 ++ .../dataproc/format/JsonToVectorBatchOp.java | 18 ++ .../dataproc/format/KvToColumnsBatchOp.java | 18 ++ .../batch/dataproc/format/KvToCsvBatchOp.java | 18 ++ .../dataproc/format/KvToJsonBatchOp.java | 18 ++ .../dataproc/format/KvToTripleBatchOp.java | 17 ++ .../dataproc/format/KvToVectorBatchOp.java | 18 ++ .../dataproc/format/TripleToAnyBatchOp.java | 127 ++++++++++ .../format/TripleToColumnsBatchOp.java | 17 ++ .../dataproc/format/TripleToCsvBatchOp.java | 17 ++ .../dataproc/format/TripleToJsonBatchOp.java | 17 ++ .../dataproc/format/TripleToKvBatchOp.java | 17 ++ .../format/TripleToVectorBatchOp.java | 26 +++ .../format/VectorToColumnsBatchOp.java | 19 ++ .../dataproc/format/VectorToCsvBatchOp.java | 18 ++ .../dataproc/format/VectorToJsonBatchOp.java | 18 ++ .../dataproc/format/VectorToKvBatchOp.java | 18 ++ .../format/VectorToTripleBatchOp.java | 
21 ++ .../format/AnyToTripleFlatMapper.java | 110 +++++++++ .../common/dataproc/format/ColumnsReader.java | 28 +++ .../common/dataproc/format/ColumnsWriter.java | 90 ++++++++ .../common/dataproc/format/CsvReader.java | 37 +++ .../common/dataproc/format/CsvWriter.java | 57 +++++ .../common/dataproc/format/FormatReader.java | 11 + .../dataproc/format/FormatTransMapper.java | 218 ++++++++++++++++++ .../dataproc/format/FormatTransParams.java | 20 ++ .../common/dataproc/format/FormatType.java | 9 + .../common/dataproc/format/FormatWriter.java | 13 ++ .../common/dataproc/format/JsonReader.java | 29 +++ .../common/dataproc/format/JsonWriter.java | 18 ++ .../common/dataproc/format/KvReader.java | 38 +++ .../common/dataproc/format/KvWriter.java | 33 +++ .../common/dataproc/format/VectorReader.java | 59 +++++ .../common/dataproc/format/VectorWriter.java | 72 ++++++ .../dataproc/format/AnyToTripleStreamOp.java | 27 +++ .../format/BaseFormatTransStreamOp.java | 29 +++ .../dataproc/format/ColumnsToCsvStreamOp.java | 18 ++ .../format/ColumnsToJsonStreamOp.java | 18 ++ .../dataproc/format/ColumnsToKvStreamOp.java | 18 ++ .../format/ColumnsToTripleStreamOp.java | 16 ++ .../format/ColumnsToVectorStreamOp.java | 18 ++ .../dataproc/format/CsvToColumnsStreamOp.java | 18 ++ .../dataproc/format/CsvToJsonStreamOp.java | 18 ++ .../dataproc/format/CsvToKvStreamOp.java | 18 ++ .../dataproc/format/CsvToTripleStreamOp.java | 16 ++ .../dataproc/format/CsvToVectorStreamOp.java | 18 ++ .../format/JsonToColumnsStreamOp.java | 18 ++ .../dataproc/format/JsonToCsvStreamOp.java | 18 ++ .../dataproc/format/JsonToKvStreamOp.java | 18 ++ .../dataproc/format/JsonToTripleStreamOp.java | 16 ++ .../dataproc/format/JsonToVectorStreamOp.java | 18 ++ .../dataproc/format/KvToColumnsStreamOp.java | 18 ++ .../dataproc/format/KvToCsvStreamOp.java | 18 ++ .../dataproc/format/KvToJsonStreamOp.java | 18 ++ .../dataproc/format/KvToTripleStreamOp.java | 16 ++ .../dataproc/format/KvToVectorStreamOp.java | 18 ++ 
.../format/VectorToColumnsStreamOp.java | 20 ++ .../dataproc/format/VectorToCsvStreamOp.java | 18 ++ .../dataproc/format/VectorToJsonStreamOp.java | 18 ++ .../dataproc/format/VectorToKvStreamOp.java | 18 ++ .../format/VectorToTripleStreamOp.java | 16 ++ .../dataproc/format/ColumnsToCsvParams.java | 8 + .../dataproc/format/ColumnsToJsonParams.java | 8 + .../dataproc/format/ColumnsToKvParams.java | 8 + .../format/ColumnsToTripleParams.java | 5 + .../format/ColumnsToVectorParams.java | 8 + .../dataproc/format/CsvToColumnsParams.java | 8 + .../dataproc/format/CsvToJsonParams.java | 8 + .../params/dataproc/format/CsvToKvParams.java | 8 + .../dataproc/format/CsvToTripleParams.java | 6 + .../dataproc/format/CsvToVectorParams.java | 8 + .../dataproc/format/FromColumnsParams.java | 7 + .../params/dataproc/format/FromCsvParams.java | 11 + .../dataproc/format/FromJsonParams.java | 5 + .../params/dataproc/format/FromKvParams.java | 7 + .../dataproc/format/FromTripleParams.java | 46 ++++ .../dataproc/format/FromVectorParams.java | 7 + .../params/dataproc/format/HasCsvCol.java | 29 +++ .../HasCsvFieldDelimiterDefaultAsComma.java | 25 ++ .../HasHandleInvalidDefaultAsError.java | 44 ++++ .../format/HasHandleInvalidDefaultAsSkip.java | 44 ++++ .../params/dataproc/format/HasJsonCol.java | 29 +++ .../params/dataproc/format/HasKvCol.java | 30 +++ .../HasKvColDelimiterDefaultAsComma.java | 26 +++ .../HasKvValDelimiterDefaultAsColon.java | 26 +++ .../dataproc/format/HasTripleRowCol.java | 28 +++ .../params/dataproc/format/HasVectorSize.java | 33 +++ .../dataproc/format/JsonToColumnsParams.java | 8 + .../dataproc/format/JsonToCsvParams.java | 8 + .../dataproc/format/JsonToKvParams.java | 8 + .../dataproc/format/JsonToTripleParams.java | 5 + .../dataproc/format/JsonToVectorParams.java | 8 + .../dataproc/format/KvToColumnsParams.java | 8 + .../params/dataproc/format/KvToCsvParams.java | 8 + .../dataproc/format/KvToJsonParams.java | 8 + .../dataproc/format/KvToTripleParams.java | 5 + 
.../dataproc/format/KvToVectorParams.java | 8 + .../dataproc/format/ToColumnsParams.java | 9 + .../params/dataproc/format/ToCsvParams.java | 13 ++ .../params/dataproc/format/ToJsonParams.java | 8 + .../params/dataproc/format/ToKvParams.java | 10 + .../dataproc/format/ToTripleParams.java | 46 ++++ .../dataproc/format/ToVectorParams.java | 10 + .../format/TripleToColumnsParams.java | 5 + .../dataproc/format/TripleToCsvParams.java | 5 + .../dataproc/format/TripleToJsonParams.java | 5 + .../dataproc/format/TripleToKvParams.java | 5 + .../dataproc/format/TripleToVectorParams.java | 5 + .../format/VectorToColumnsParams.java | 8 + .../dataproc/format/VectorToCsvParams.java | 8 + .../dataproc/format/VectorToJsonParams.java | 8 + .../dataproc/format/VectorToKvParams.java | 8 + .../dataproc/format/VectorToTripleParams.java | 5 + .../colname/HasReservedColsDefaultAsNull.java | 26 +++ .../dataproc/format/BaseFormatTrans.java | 21 ++ .../dataproc/format/ColumnsToCsv.java | 18 ++ .../dataproc/format/ColumnsToJson.java | 18 ++ .../pipeline/dataproc/format/ColumnsToKv.java | 18 ++ .../dataproc/format/ColumnsToVector.java | 18 ++ .../dataproc/format/CsvToColumns.java | 18 ++ .../pipeline/dataproc/format/CsvToJson.java | 18 ++ .../pipeline/dataproc/format/CsvToKv.java | 18 ++ .../pipeline/dataproc/format/CsvToVector.java | 18 ++ .../dataproc/format/JsonToColumns.java | 18 ++ .../pipeline/dataproc/format/JsonToCsv.java | 18 ++ .../pipeline/dataproc/format/JsonToKv.java | 18 ++ .../dataproc/format/JsonToVector.java | 18 ++ .../pipeline/dataproc/format/KvToColumns.java | 18 ++ .../pipeline/dataproc/format/KvToCsv.java | 18 ++ .../pipeline/dataproc/format/KvToJson.java | 18 ++ .../pipeline/dataproc/format/KvToVector.java | 18 ++ .../dataproc/format/VectorToColumns.java | 18 ++ .../pipeline/dataproc/format/VectorToCsv.java | 18 ++ .../dataproc/format/VectorToJson.java | 18 ++ .../pipeline/dataproc/format/VectorToKv.java | 18 ++ .../format/AnyToTripleFlatMapperTest.java | 39 ++++ 
.../format/FormatTransMapperTest.java | 216 +++++++++++++++++ .../format/TripleToAnyBatchOpTest.java | 66 ++++++ 155 files changed, 3527 insertions(+), 2 deletions(-) create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/AnyToTripleBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/BaseFormatTransBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToCsvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToJsonBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToKvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToTripleBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToVectorBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToColumnsBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToJsonBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToKvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToTripleBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToVectorBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToColumnsBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToCsvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToKvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToTripleBatchOp.java create mode 100644 
core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToVectorBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToColumnsBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToCsvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToJsonBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToTripleBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToVectorBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToAnyBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToColumnsBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToCsvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToJsonBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToKvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToVectorBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToColumnsBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToCsvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToJsonBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToKvBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToTripleBatchOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapper.java create 
mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransMapper.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransParams.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatType.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorReader.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorWriter.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/AnyToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/BaseFormatTransStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/ColumnsToCsvStreamOp.java create mode 100644 
core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/ColumnsToJsonStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/ColumnsToKvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/ColumnsToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/ColumnsToVectorStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToColumnsStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToJsonStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToKvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToVectorStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToColumnsStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToCsvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToKvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToVectorStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToColumnsStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToCsvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToJsonStreamOp.java create mode 100644 
core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToVectorStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToColumnsStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToCsvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToJsonStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToKvStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToTripleStreamOp.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/FromColumnsParams.java create mode 100644 
core/src/main/java/com/alibaba/alink/params/dataproc/format/FromCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/FromJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/FromKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/FromTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/FromVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvCol.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvFieldDelimiterDefaultAsComma.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsError.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsSkip.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasJsonCol.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvCol.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvColDelimiterDefaultAsComma.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvValDelimiterDefaultAsColon.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasTripleRowCol.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/HasVectorSize.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToTripleParams.java create mode 100644 
core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/ToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToJsonParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToVectorParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToColumnsParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToCsvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToJsonParams.java 
create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToKvParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToTripleParams.java create mode 100644 core/src/main/java/com/alibaba/alink/params/shared/colname/HasReservedColsDefaultAsNull.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/BaseFormatTrans.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToCsv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToJson.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToKv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToVector.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToColumns.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToJson.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToKv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToVector.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToColumns.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToCsv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToKv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToVector.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToColumns.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToCsv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToJson.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToVector.java create mode 100644 
core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToColumns.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToCsv.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToJson.java create mode 100644 core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToKv.java create mode 100644 core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapperTest.java create mode 100644 core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransMapperTest.java create mode 100644 core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/TripleToAnyBatchOpTest.java diff --git a/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapper.java b/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapper.java index 169eb3b5a..d5fc201fd 100644 --- a/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapper.java +++ b/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapper.java @@ -54,4 +54,9 @@ protected TableSchema getDataSchema() { */ public abstract TableSchema getOutputSchema(); + public void open() { + } + + public void close() { + } } diff --git a/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapperAdapter.java b/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapperAdapter.java index 39d7108b9..6f061b3b0 100644 --- a/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapperAdapter.java +++ b/core/src/main/java/com/alibaba/alink/common/mapper/FlatMapperAdapter.java @@ -2,14 +2,15 @@ import java.io.Serializable; -import org.apache.flink.api.common.functions.FlatMapFunction; +import org.apache.flink.api.common.functions.RichFlatMapFunction; +import org.apache.flink.configuration.Configuration; import org.apache.flink.types.Row; import org.apache.flink.util.Collector; /** * Adapt a {@link Mapper} to run within flink. 
*/ -public class FlatMapperAdapter implements FlatMapFunction, Serializable { +public class FlatMapperAdapter extends RichFlatMapFunction implements Serializable { private final FlatMapper mapper; @@ -17,6 +18,16 @@ public FlatMapperAdapter(FlatMapper mapper) { this.mapper = mapper; } + @Override + public void open(Configuration parameters) throws Exception { + mapper.open(); + } + + @Override + public void close() throws Exception { + mapper.close(); + } + @Override public void flatMap(Row value, Collector out) throws Exception { this.mapper.flatMap(value, out); diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/AnyToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/AnyToTripleBatchOp.java new file mode 100644 index 000000000..89f450592 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/AnyToTripleBatchOp.java @@ -0,0 +1,27 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.batch.utils.FlatMapBatchOp; +import com.alibaba.alink.operator.common.dataproc.format.AnyToTripleFlatMapper; +import com.alibaba.alink.operator.common.dataproc.format.FormatTransParams; +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class AnyToTripleBatchOp> extends FlatMapBatchOp + implements ToTripleParams { + + public AnyToTripleBatchOp() { + this(null); + } + + public AnyToTripleBatchOp(FormatType formatType, Params params) { + this( + (null == params ? 
new Params() : params) + .set(FormatTransParams.FROM_FORMAT, formatType) + ); + } + + public AnyToTripleBatchOp(Params params) { + super(AnyToTripleFlatMapper::new, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/BaseFormatTransBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/BaseFormatTransBatchOp.java new file mode 100644 index 000000000..cd7cb9cec --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/BaseFormatTransBatchOp.java @@ -0,0 +1,30 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import org.apache.flink.ml.api.misc.param.Params; + +import com.alibaba.alink.operator.batch.utils.MapBatchOp; +import com.alibaba.alink.operator.common.dataproc.format.FormatTransMapper; +import com.alibaba.alink.operator.common.dataproc.format.FormatTransParams; +import com.alibaba.alink.operator.common.dataproc.format.FormatType; + +/** + * Base batch operator for format transformations. Stores the source and target FormatType in the params (FormatTransParams.FROM_FORMAT / TO_FORMAT) and delegates the conversion to FormatTransMapper. + */ +public class BaseFormatTransBatchOp> extends MapBatchOp { + + private BaseFormatTransBatchOp() { + this(null); + } + + public BaseFormatTransBatchOp(FormatType fromFormat, FormatType toFormat, Params params) { + this( + (null == params ? 
new Params() : params) + .set(FormatTransParams.FROM_FORMAT, fromFormat) + .set(FormatTransParams.TO_FORMAT, toFormat) + ); + } + + private BaseFormatTransBatchOp(Params params) { + super(FormatTransMapper::new, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToCsvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToCsvBatchOp.java new file mode 100644 index 000000000..685bb7578 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToCsvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToCsvBatchOp extends BaseFormatTransBatchOp + implements ColumnsToCsvParams { + + public ColumnsToCsvBatchOp() { + this(new Params()); + } + + public ColumnsToCsvBatchOp(Params params) { + super(FormatType.COLUMNS, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToJsonBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToJsonBatchOp.java new file mode 100644 index 000000000..84b0b2bae --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToJsonBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToJsonBatchOp extends BaseFormatTransBatchOp + implements ColumnsToJsonParams { + + public ColumnsToJsonBatchOp() { + this(new Params()); + } + + public ColumnsToJsonBatchOp(Params params) { + 
super(FormatType.COLUMNS, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToKvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToKvBatchOp.java new file mode 100644 index 000000000..bcf86fada --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToKvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToKvBatchOp extends BaseFormatTransBatchOp + implements ColumnsToKvParams { + + public ColumnsToKvBatchOp() { + this(new Params()); + } + + public ColumnsToKvBatchOp(Params params) { + super(FormatType.COLUMNS, FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToTripleBatchOp.java new file mode 100644 index 000000000..27964aafb --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToTripleBatchOp.java @@ -0,0 +1,20 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToTripleBatchOp extends AnyToTripleBatchOp + implements ColumnsToTripleParams { + + private static final long serialVersionUID = 7543648266815893977L; + + public ColumnsToTripleBatchOp() { + this(new Params()); + } + + public ColumnsToTripleBatchOp(Params params) { + super(FormatType.COLUMNS, params); + } + +} \ No newline at end of file diff --git 
a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToVectorBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToVectorBatchOp.java new file mode 100644 index 000000000..074a1bd14 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/ColumnsToVectorBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToVectorBatchOp extends BaseFormatTransBatchOp + implements ColumnsToVectorParams { + + public ColumnsToVectorBatchOp() { + this(new Params()); + } + + public ColumnsToVectorBatchOp(Params params) { + super(FormatType.COLUMNS, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToColumnsBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToColumnsBatchOp.java new file mode 100644 index 000000000..6c45183d1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToColumnsBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToColumnsBatchOp extends BaseFormatTransBatchOp + implements CsvToColumnsParams { + + public CsvToColumnsBatchOp() { + this(new Params()); + } + + public CsvToColumnsBatchOp(Params params) { + super(FormatType.CSV, FormatType.COLUMNS, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToJsonBatchOp.java 
b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToJsonBatchOp.java new file mode 100644 index 000000000..a6cad99c5 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToJsonBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToJsonBatchOp extends BaseFormatTransBatchOp + implements CsvToJsonParams { + + public CsvToJsonBatchOp() { + this(new Params()); + } + + public CsvToJsonBatchOp(Params params) { + super(FormatType.CSV, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToKvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToKvBatchOp.java new file mode 100644 index 000000000..2a113ce90 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToKvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToKvBatchOp extends BaseFormatTransBatchOp + implements CsvToKvParams { + + public CsvToKvBatchOp() { + this(new Params()); + } + + public CsvToKvBatchOp(Params params) { + super(FormatType.CSV, FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToTripleBatchOp.java new file mode 100644 index 000000000..112da8c8f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToTripleBatchOp.java @@ -0,0 
+1,20 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToTripleBatchOp extends AnyToTripleBatchOp + implements CsvToTripleParams { + + private static final long serialVersionUID = 7543648266815893977L; + + public CsvToTripleBatchOp() { + this(new Params()); + } + + public CsvToTripleBatchOp(Params params) { + super(FormatType.CSV, params); + } + +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToVectorBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToVectorBatchOp.java new file mode 100644 index 000000000..e908056bd --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/CsvToVectorBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToVectorBatchOp extends BaseFormatTransBatchOp + implements CsvToVectorParams { + + public CsvToVectorBatchOp() { + this(new Params()); + } + + public CsvToVectorBatchOp(Params params) { + super(FormatType.CSV, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToColumnsBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToColumnsBatchOp.java new file mode 100644 index 000000000..3085b6fc3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToColumnsBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import 
com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToColumnsBatchOp extends BaseFormatTransBatchOp + implements JsonToColumnsParams { + + public JsonToColumnsBatchOp() { + this(new Params()); + } + + public JsonToColumnsBatchOp(Params params) { + super(FormatType.JSON, FormatType.COLUMNS, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToCsvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToCsvBatchOp.java new file mode 100644 index 000000000..4559d80dd --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToCsvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToCsvBatchOp extends BaseFormatTransBatchOp + implements JsonToCsvParams { + + public JsonToCsvBatchOp() { + this(new Params()); + } + + public JsonToCsvBatchOp(Params params) { + super(FormatType.JSON, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToKvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToKvBatchOp.java new file mode 100644 index 000000000..a3dfabaec --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToKvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToKvBatchOp extends 
BaseFormatTransBatchOp + implements JsonToKvParams { + + public JsonToKvBatchOp() { + this(new Params()); + } + + public JsonToKvBatchOp(Params params) { + super(FormatType.JSON, FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToTripleBatchOp.java new file mode 100644 index 000000000..2da6c02e1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToTripleBatchOp.java @@ -0,0 +1,20 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToTripleBatchOp extends AnyToTripleBatchOp + implements JsonToTripleParams { + + private static final long serialVersionUID = 7543648266815893977L; + + public JsonToTripleBatchOp() { + this(new Params()); + } + + public JsonToTripleBatchOp(Params params) { + super(FormatType.JSON, params); + } + +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToVectorBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToVectorBatchOp.java new file mode 100644 index 000000000..07b7111f5 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/JsonToVectorBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToVectorBatchOp extends BaseFormatTransBatchOp + implements JsonToVectorParams { + + public JsonToVectorBatchOp() { + this(new Params()); + } + + public 
JsonToVectorBatchOp(Params params) { + super(FormatType.JSON, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToColumnsBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToColumnsBatchOp.java new file mode 100644 index 000000000..3d9a393f1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToColumnsBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToColumnsBatchOp extends BaseFormatTransBatchOp + implements KvToColumnsParams { + + public KvToColumnsBatchOp() { + this(new Params()); + } + + public KvToColumnsBatchOp(Params params) { + super(FormatType.KV, FormatType.COLUMNS, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToCsvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToCsvBatchOp.java new file mode 100644 index 000000000..1c84f6312 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToCsvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToCsvBatchOp extends BaseFormatTransBatchOp + implements KvToCsvParams { + + public KvToCsvBatchOp() { + this(new Params()); + } + + public KvToCsvBatchOp(Params params) { + super(FormatType.KV, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToJsonBatchOp.java 
b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToJsonBatchOp.java new file mode 100644 index 000000000..84128b01b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToJsonBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToJsonBatchOp extends BaseFormatTransBatchOp + implements KvToJsonParams { + + public KvToJsonBatchOp() { + this(new Params()); + } + + public KvToJsonBatchOp(Params params) { + super(FormatType.KV, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToTripleBatchOp.java new file mode 100644 index 000000000..a49089e7a --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToTripleBatchOp.java @@ -0,0 +1,17 @@ +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToTripleBatchOp extends AnyToTripleBatchOp + implements KvToTripleParams { + + public KvToTripleBatchOp() { + this(new Params()); + } + + public KvToTripleBatchOp(Params params) { + super(FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToVectorBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToVectorBatchOp.java new file mode 100644 index 000000000..731e7738f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/KvToVectorBatchOp.java @@ -0,0 +1,18 @@ + 
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.KvToVectorParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+public class KvToVectorBatchOp extends BaseFormatTransBatchOp
+    implements KvToVectorParams {
+
+    public KvToVectorBatchOp() {
+        this(new Params());
+    }
+
+    public KvToVectorBatchOp(Params params) {
+        super(FormatType.KV, FormatType.VECTOR, params);
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToAnyBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToAnyBatchOp.java
new file mode 100644
index 000000000..1e13546e0
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToAnyBatchOp.java
@@ -0,0 +1,156 @@
+//
+// Copyright (c) 2014, Alibaba Inc.
+// All rights reserved.
+//
+// Author: Yan Huang
+// Created: 6/12/18
+// Description:
+//
+
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.common.utils.JsonConverter;
+import com.alibaba.alink.common.utils.TableUtil;
+import com.alibaba.alink.operator.batch.BatchOperator;
+import com.alibaba.alink.operator.common.dataproc.format.FormatTransMapper;
+import com.alibaba.alink.operator.common.dataproc.format.FormatTransParams;
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.operator.common.dataproc.format.FormatWriter;
+import com.alibaba.alink.params.dataproc.format.FromTripleParams;
+import com.alibaba.alink.params.dataproc.format.HasHandleInvalidDefaultAsError;
+import com.alibaba.alink.params.dataproc.format.HasHandleInvalidDefaultAsError.HandleInvalid;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.common.functions.RichGroupReduceFunction;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.ml.api.misc.param.Params;
+import org.apache.flink.types.Row;
+import org.apache.flink.util.Collector;
+
+import java.util.HashMap;
+
+/**
+ * Batch operator base that regroups (row, column, value) triples by their row
+ * key and writes each group out in the target {@link FormatType}.
+ *
+ * <p>The output table holds the triple row column followed by the columns
+ * reported by the format writer.
+ *
+ * @param <T> the concrete operator type, returned by {@link #linkFrom}.
+ */
+public class TripleToAnyBatchOp<T extends TripleToAnyBatchOp<T>> extends BatchOperator<T>
+    implements FromTripleParams<T> {
+
+    private static final long serialVersionUID = 6283495106807306943L;
+
+    /**
+     * @param toFormat the format each group of triples is written to.
+     * @param params   operator parameters; {@code null} is treated as empty.
+     */
+    public TripleToAnyBatchOp(FormatType toFormat, Params params) {
+        super((null == params ? new Params() : params).set(FormatTransParams.TO_FORMAT, toFormat));
+    }
+
+    /**
+     * Groups the triples of the first input by row key and converts each group.
+     *
+     * @param inputs the upstream operators; only the first one is read.
+     * @return this operator, with its output set.
+     */
+    @Override
+    @SuppressWarnings("unchecked")
+    public T linkFrom(BatchOperator<?>... inputs) {
+        BatchOperator<?> in = checkAndGetFirst(inputs);
+
+        String rowColName = getTripleRowCol();
+        String columnColName = getTripleColCol();
+        String valueColName = getTripleValCol();
+        TypeInformation<?> rowType = TableUtil.findColType(in.getSchema(), rowColName);
+
+        DataSet<Tuple3<Comparable, Object, Object>> tuple3
+            = in.select(new String[] {rowColName, columnColName, valueColName})
+            .getDataSet()
+            .map(
+                new MapFunction<Row, Tuple3<Comparable, Object, Object>>() {
+                    @Override
+                    public Tuple3<Comparable, Object, Object> map(Row value) throws Exception {
+                        return new Tuple3<>((Comparable) value.getField(0), value.getField(1), value.getField(2));
+                    }
+                }
+            );
+        DataSet<Row> dataSet = tuple3
+            .groupBy(0)
+            .reduceGroup(new ToAny(getParams()));
+
+        // Resolve the output columns with the same factory call the reducer uses in open().
+        Tuple3<FormatWriter, String[], TypeInformation[]> t3To
+            = FormatTransMapper.initFormatWriter(getParams(), null);
+        String[] outputColNames = t3To.f1;
+        TypeInformation[] outputColTypes = t3To.f2;
+
+        this.setOutput(
+            dataSet,
+            ArrayUtils.addAll(new String[] {rowColName}, outputColNames),
+            ArrayUtils.addAll(new TypeInformation<?>[] {rowType}, outputColTypes)
+        );
+        return (T) this;
+    }
+
+    /**
+     * Collapses all triples that share a row key into a single output row.
+     */
+    public static class ToAny extends RichGroupReduceFunction<Tuple3<Comparable, Object, Object>, Row> {
+        private static final long serialVersionUID = 4128130689819716473L;
+        private final Params params;
+        FormatWriter formatWriter;
+        private HandleInvalid handleInvalid;
+
+        public ToAny(Params params) {
+            this.params = params;
+            this.handleInvalid = params.get(HasHandleInvalidDefaultAsError.HANDLE_INVALID);
+        }
+
+        @Override
+        public void open(Configuration parameters) {
+            // The writer is created here rather than in the constructor.
+            formatWriter = FormatTransMapper.initFormatWriter(params, null).f0;
+        }
+
+        @Override
+        public void reduce(Iterable<Tuple3<Comparable, Object, Object>> iterable, Collector<Row> out)
+            throws Exception {
+            Object rowItem = null;
+            HashMap<String, String> bufMap = new HashMap<>();
+            for (Tuple3<Comparable, Object, Object> item : iterable) {
+                rowItem = item.f0;
+                // A null column name or null value carries no cell; calling toString()
+                // on it would throw a NullPointerException, so such triples are skipped.
+                if (null != item.f1 && null != item.f2) {
+                    bufMap.put(item.f1.toString(), item.f2.toString());
+                }
+            }
+
+            Tuple2<Boolean, Row> t2 = formatWriter.write(bufMap);
+            if (!t2.f0) {
+                if (handleInvalid.equals(HandleInvalid.ERROR)) {
+                    throw new RuntimeException("Fail to convert: " + JsonConverter.toJson(bufMap));
+                } else {
+                    return;
+                }
+            }
+
+            Row row = new Row(1 + t2.f1.getArity());
+            row.setField(0, rowItem);
+            for (int i = 0; i < t2.f1.getArity(); i++) {
+                row.setField(i + 1, t2.f1.getField(i));
+            }
+
+            out.collect(row);
+        }
+    }
+
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToColumnsBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToColumnsBatchOp.java
new file mode 100644
index 000000000..bd4fc72d8
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToColumnsBatchOp.java
@@ -0,0 +1,24 @@
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.TripleToColumnsParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+/**
+ * Batch operator that regroups (row, column, value) triples and writes each row in the COLUMNS format.
+ */
+public class TripleToColumnsBatchOp extends TripleToAnyBatchOp<TripleToColumnsBatchOp>
+    implements TripleToColumnsParams<TripleToColumnsBatchOp> {
+
+    public TripleToColumnsBatchOp() {
+        this(new Params());
+    }
+
+    /**
+     * @param params operator parameters; copied so the caller's instance is left untouched,
+     *               and {@code null} is treated as empty.
+     */
+    public TripleToColumnsBatchOp(Params params) {
+        super(FormatType.COLUMNS, null == params ? null : params.clone());
+    }
+}
diff --git 
a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToCsvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToCsvBatchOp.java
new file mode 100644
index 000000000..a82ecb3bf
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToCsvBatchOp.java
@@ -0,0 +1,24 @@
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.TripleToCsvParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+/**
+ * Batch operator that regroups (row, column, value) triples and writes each row in the CSV format.
+ */
+public class TripleToCsvBatchOp extends TripleToAnyBatchOp<TripleToCsvBatchOp>
+    implements TripleToCsvParams<TripleToCsvBatchOp> {
+
+    public TripleToCsvBatchOp() {
+        this(new Params());
+    }
+
+    /**
+     * @param params operator parameters; copied so the caller's instance is left untouched,
+     *               and {@code null} is treated as empty.
+     */
+    public TripleToCsvBatchOp(Params params) {
+        super(FormatType.CSV, null == params ? null : params.clone());
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToJsonBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToJsonBatchOp.java
new file mode 100644
index 000000000..598ecd5e0
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToJsonBatchOp.java
@@ -0,0 +1,24 @@
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.TripleToJsonParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+/**
+ * Batch operator that regroups (row, column, value) triples and writes each row in the JSON format.
+ */
+public class TripleToJsonBatchOp extends TripleToAnyBatchOp<TripleToJsonBatchOp>
+    implements TripleToJsonParams<TripleToJsonBatchOp> {
+
+    public TripleToJsonBatchOp() {
+        this(new Params());
+    }
+
+    /**
+     * @param params operator parameters; copied so the caller's instance is left untouched,
+     *               and {@code null} is treated as empty.
+     */
+    public TripleToJsonBatchOp(Params params) {
+        super(FormatType.JSON, null == params ? null : params.clone());
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToKvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToKvBatchOp.java
new file mode 100644
index 000000000..e94c31f55
--- 
/dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToKvBatchOp.java
@@ -0,0 +1,24 @@
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.TripleToKvParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+/**
+ * Batch operator that regroups (row, column, value) triples and writes each row in the KV format.
+ */
+public class TripleToKvBatchOp extends TripleToAnyBatchOp<TripleToKvBatchOp>
+    implements TripleToKvParams<TripleToKvBatchOp> {
+
+    public TripleToKvBatchOp() {
+        this(new Params());
+    }
+
+    /**
+     * @param params operator parameters; copied so the caller's instance is left untouched,
+     *               and {@code null} is treated as empty.
+     */
+    public TripleToKvBatchOp(Params params) {
+        super(FormatType.KV, null == params ? null : params.clone());
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToVectorBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToVectorBatchOp.java
new file mode 100644
index 000000000..5febd9496
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/TripleToVectorBatchOp.java
@@ -0,0 +1,33 @@
+//
+// Copyright (c) 2014, Alibaba Inc.
+// All rights reserved.
+//
+// Author: Yan Huang
+// Created: 6/12/18
+// Description:
+//
+
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.TripleToVectorParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+/**
+ * Batch operator that regroups (row, column, value) triples and writes each row in the VECTOR format.
+ */
+public final class TripleToVectorBatchOp extends TripleToAnyBatchOp<TripleToVectorBatchOp>
+    implements TripleToVectorParams<TripleToVectorBatchOp> {
+
+    public TripleToVectorBatchOp() {
+        this(new Params());
+    }
+
+    /**
+     * @param params operator parameters; copied so the caller's instance is left untouched,
+     *               and {@code null} is treated as empty.
+     */
+    public TripleToVectorBatchOp(Params params) {
+        super(FormatType.VECTOR, null == params ? null : params.clone());
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToColumnsBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToColumnsBatchOp.java
new file mode 100644
index 000000000..12d613c87
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToColumnsBatchOp.java
@@ -0,0 +1,22 @@
+
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.VectorToColumnsParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+
+/**
+ * Batch operator configured with VECTOR as the source format and COLUMNS as the target format.
+ */
+public class VectorToColumnsBatchOp extends BaseFormatTransBatchOp
+    implements VectorToColumnsParams {
+
+    public VectorToColumnsBatchOp() {
+        this(new Params());
+    }
+
+    public VectorToColumnsBatchOp(Params params) {
+        super(FormatType.VECTOR, FormatType.COLUMNS, params);
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToCsvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToCsvBatchOp.java
new file mode 100644
index 000000000..6c7c968f3
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToCsvBatchOp.java
@@ -0,0 +1,18 @@
+
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import 
com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToCsvBatchOp extends BaseFormatTransBatchOp + implements VectorToCsvParams { + + public VectorToCsvBatchOp() { + this(new Params()); + } + + public VectorToCsvBatchOp(Params params) { + super(FormatType.VECTOR, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToJsonBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToJsonBatchOp.java new file mode 100644 index 000000000..0c70d4598 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToJsonBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToJsonBatchOp extends BaseFormatTransBatchOp + implements VectorToJsonParams { + + public VectorToJsonBatchOp() { + this(new Params()); + } + + public VectorToJsonBatchOp(Params params) { + super(FormatType.VECTOR, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToKvBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToKvBatchOp.java new file mode 100644 index 000000000..6b8b4f716 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToKvBatchOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.batch.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class 
VectorToKvBatchOp extends BaseFormatTransBatchOp
+    implements VectorToKvParams {
+
+    public VectorToKvBatchOp() {
+        this(new Params());
+    }
+
+    public VectorToKvBatchOp(Params params) {
+        super(FormatType.VECTOR, FormatType.KV, params);
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToTripleBatchOp.java b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToTripleBatchOp.java
new file mode 100644
index 000000000..49a0cfd68
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/batch/dataproc/format/VectorToTripleBatchOp.java
@@ -0,0 +1,21 @@
+package com.alibaba.alink.operator.batch.dataproc.format;
+
+import com.alibaba.alink.operator.common.dataproc.format.FormatType;
+import com.alibaba.alink.params.dataproc.format.VectorToTripleParams;
+import org.apache.flink.ml.api.misc.param.Params;
+
+
+public final class VectorToTripleBatchOp extends AnyToTripleBatchOp
+    implements VectorToTripleParams {
+
+    private static final long serialVersionUID = 7543648266815893977L;
+
+    public VectorToTripleBatchOp() {
+        this(new Params());
+    }
+
+    public VectorToTripleBatchOp(Params params) {
+        super(FormatType.VECTOR, params);
+    }
+
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapper.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapper.java
new file mode 100644
index 000000000..e7a5d6240
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapper.java
@@ -0,0 +1,129 @@
+package com.alibaba.alink.operator.common.dataproc.format;
+
+import com.alibaba.alink.common.utils.JsonConverter;
+
+import com.alibaba.alink.params.dataproc.format.HasHandleInvalidDefaultAsError;
+import com.alibaba.alink.params.dataproc.format.HasHandleInvalidDefaultAsError.HandleInvalid;
+import com.alibaba.alink.params.dataproc.format.ToTripleParams;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.ml.api.misc.param.Params;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.types.Row;
+import org.apache.flink.types.parser.FieldParser;
+import org.apache.flink.util.Collector;
+
+import com.alibaba.alink.common.mapper.FlatMapper;
+import com.alibaba.alink.common.utils.OutputColsHelper;
+import com.alibaba.alink.operator.common.io.csv.CsvUtil;
+import org.apache.flink.util.StringUtils;
+
+import java.io.Serializable;
+import java.util.HashMap;
+import java.util.Map;
+
+import static com.alibaba.alink.operator.common.dataproc.format.FormatTransMapper.initFormatReader;
+
+/**
+ * Flat mapper that reads one input row with the configured format reader and
+ * emits one output row per (column, value) entry found in it.
+ */
+public class AnyToTripleFlatMapper extends FlatMapper implements Serializable {
+
+    private static final long serialVersionUID = 3221991172531895169L;
+    private OutputColsHelper outputColsHelper;
+    private HandleInvalid handleInvalid;
+    private FormatReader formatReader;
+
+    private HashMap<String, String> bufMap = new HashMap<>();
+
+    private FieldParser<?>[] parsers;
+    private boolean[] isString;
+    private TypeInformation<?>[] fieldTypes;
+
+    /**
+     * Constructor.
+     *
+     * @param dataSchema the dataSchema.
+     * @param params     the params.
+     * @throws IllegalArgumentException if the triple column/value schema does not
+     *                                  declare exactly two fields.
+     */
+    public AnyToTripleFlatMapper(TableSchema dataSchema, Params params) {
+        super(dataSchema, params);
+
+        TableSchema schema = CsvUtil.schemaStr2Schema(params.get(ToTripleParams.TRIPLE_COL_VAL_SCHEMA_STR));
+
+        fieldTypes = schema.getFieldTypes();
+        // flatMap() emits (key, value) pairs parsed with parsers[0] and parsers[1].
+        if (fieldTypes.length != 2) {
+            throw new IllegalArgumentException(
+                "The triple column/value schema must declare exactly 2 fields, but got " + fieldTypes.length);
+        }
+        String[] reversedCols = this.params.get(ToTripleParams.RESERVED_COLS);
+        this.handleInvalid = params.get(HasHandleInvalidDefaultAsError.HANDLE_INVALID);
+        this.outputColsHelper = new OutputColsHelper(
+            dataSchema,
+            schema.getFieldNames(),
+            schema.getFieldTypes(),
+            reversedCols
+        );
+    }
+
+    @Override
+    public void open() {
+        this.formatReader = initFormatReader(super.getDataSchema(), params).f0;
+        this.isString = new boolean[fieldTypes.length];
+        this.parsers = new FieldParser[fieldTypes.length];
+
+        for (int i = 0; i < fieldTypes.length; i++) {
+            parsers[i] = ColumnsWriter.getFieldParser(fieldTypes[i].getTypeClass());
+            isString[i] = fieldTypes[i].equals(Types.STRING);
+        }
+    }
+
+    /**
+     * Emits one row per non-blank entry read from {@code row}.
+     *
+     * @param row    the input row; a {@code null} row is forwarded as {@code null}.
+     * @param output the collector receiving the result rows.
+     * @throws RuntimeException if reading or parsing fails and invalid data is
+     *                          handled as {@link HandleInvalid#ERROR}.
+     */
+    @Override
+    public void flatMap(Row row, Collector<Row> output) throws Exception {
+        if (null == row) {
+            output.collect(null);
+            // Nothing to read; handing a null row to the reader would throw.
+            return;
+        }
+        bufMap.clear();
+        boolean success = formatReader.read(row, bufMap);
+        if (success) {
+            for (Map.Entry<String, String> entry : bufMap.entrySet()) {
+                Tuple2<Boolean, Object> parsedKey = ColumnsWriter.parseField(parsers[0], entry.getKey(), isString[0]);
+                Tuple2<Boolean, Object> parsedValue = ColumnsWriter.parseField(parsers[1], entry.getValue(), isString[1]);
+                if (!StringUtils.isNullOrWhitespaceOnly(entry.getValue())) {
+                    if (parsedKey.f0 && parsedValue.f0) {
+                        output.collect(outputColsHelper
+                            .getResultRow(row, Row.of(parsedKey.f1, parsedValue.f1)));
+                    } else if (handleInvalid.equals(HandleInvalid.ERROR)) {
+                        throw new RuntimeException("Fail to write: " + JsonConverter.toJson(bufMap));
+                    }
+                }
+            }
+        } else if (handleInvalid.equals(HandleInvalid.ERROR)) {
+            throw new RuntimeException("Fail to read: " + row);
+        }
+    }
+
+    /**
+     * Get the output data schema.
+     */
+    @Override
+    public TableSchema getOutputSchema() {
+        return outputColsHelper.getResultSchema();
+    }
+
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsReader.java
new file mode 100644
index 000000000..69f6053e0
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsReader.java
@@ -0,0 +1,36 @@
+package com.alibaba.alink.operator.common.dataproc.format;
+
+import org.apache.flink.types.Row;
+
+import java.util.Map;
+
+/**
+ * Reads selected columns of a row into a column-name to string-value map.
+ */
+public class ColumnsReader extends FormatReader {
+
+    final String[] colNames;
+    final int[] colIndices;
+
+    /**
+     * @param colIndices positions of the columns to read in the input row.
+     * @param colNames   key under which each column is written; parallel to {@code colIndices}.
+     */
+    public ColumnsReader(int[] colIndices, String[] colNames) {
+        this.colNames = colNames;
+        this.colIndices = colIndices;
+    }
+
+    @Override
+    boolean read(Row row, Map<String, String> out) {
+        for (int i = 0; i < colNames.length; i++) {
+            Object obj = row.getField(colIndices[i]);
+            // Null fields are left out of the map.
+            if (null != obj) {
+                out.put(colNames[i], obj.toString());
+            }
+        }
+        return true;
+    }
+
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsWriter.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsWriter.java
new file mode 100644
index 000000000..6a0b8e13f
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/ColumnsWriter.java
@@ -0,0 +1,118 @@
+package com.alibaba.alink.operator.common.dataproc.format;
+
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.types.Row;
+import org.apache.flink.types.parser.FieldParser;
+import org.apache.flink.util.InstantiationUtil;
+import org.apache.flink.util.StringUtils;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Writes a column-name to string-value map into a row laid out by a table schema,
+ * parsing every value with the field parser of its target column.
+ */
+public class ColumnsWriter extends FormatWriter {
+
+    final int nCols;
+    final String[] colNames;
+    private FieldParser<?>[] parsers;
+    private boolean[] isString;
+    // Not transient: the index is only built in the constructor, so a transient
+    // field would come back null if an instance were ever deserialized.
+    private Map<String, Integer> keyToFieldIdx;
+
+    public ColumnsWriter(TableSchema schema) {
+        this.nCols = schema.getFieldNames().length;
+        this.colNames = schema.getFieldNames();
+        this.isString = new boolean[colNames.length];
+        TypeInformation<?>[] fieldTypes = schema.getFieldTypes();
+
+        this.parsers = new FieldParser[fieldTypes.length];
+
+        for (int i = 0; i < fieldTypes.length; i++) {
+            parsers[i] = getFieldParser(fieldTypes[i].getTypeClass());
+            isString[i] = fieldTypes[i].equals(Types.STRING);
+        }
+
+        keyToFieldIdx = new HashMap<>();
+        for (int i = 0; i < colNames.length; i++) {
+            keyToFieldIdx.put(colNames[i], i);
+        }
+    }
+
+    /**
+     * Builds a row from the entries whose key names a schema column; other keys are ignored.
+     *
+     * @param in column name to string value.
+     * @return (success, row); success turns false at the first value that cannot be parsed.
+     */
+    @Override
+    public Tuple2<Boolean, Row> write(Map<String, String> in) {
+        boolean success = true;
+        Row row = new Row(nCols);
+        try {
+            for (Map.Entry<String, String> entry : in.entrySet()) {
+                Integer idx = keyToFieldIdx.get(entry.getKey());
+                if (null != idx) {
+                    Tuple2<Boolean, Object> parsed = parseField(parsers[idx], entry.getValue(), isString[idx]);
+                    if (parsed.f0) {
+                        row.setField(idx, parsed.f1);
+                    } else {
+                        success = false;
+                        break;
+                    }
+                }
+            }
+        } catch (Exception ex) {
+            // Any failure while parsing is reported through the success flag.
+            success = false;
+        }
+        return new Tuple2<>(success, row);
+    }
+
+    /**
+     * Instantiates the field parser registered for the given type.
+     *
+     * @throws RuntimeException if no parser is available for {@code typeClazz}.
+     */
+    static FieldParser<?> getFieldParser(Class<?> typeClazz) {
+        Class<? extends FieldParser<?>> parserType = FieldParser.getParserForType(typeClazz);
+        if (parserType == null) {
+            throw new RuntimeException("No parser available for type '" + typeClazz.getName() + "'.");
+        }
+        return InstantiationUtil.instantiate(parserType, FieldParser.class);
+    }
+
+    /**
+     * Parses a string token with the given parser.
+     *
+     * @param parser        parser for the target type; not used for string fields.
+     * @param token         the text to parse.
+     * @param isStringField whether the target is a string, in which case the token is returned as is.
+     * @return (true, value) on success; (false, null) for a blank token or a parse error.
+     */
+    static Tuple2<Boolean, Object> parseField(FieldParser<?> parser, String token, boolean isStringField) {
+        if (isStringField) {
+            return Tuple2.of(true, token);
+        } else {
+            if (StringUtils.isNullOrWhitespaceOnly(token)) {
+                return Tuple2.of(false, null);
+            }
+            // Encode explicitly instead of relying on the platform default charset.
+            byte[] bytes = token.getBytes(StandardCharsets.UTF_8);
+            parser.resetErrorStateAndParse(bytes, 0, bytes.length, new byte[] {0}, null);
+            FieldParser.ParseErrorState errorState = parser.getErrorState();
+            if (errorState != FieldParser.ParseErrorState.NONE) {
+                return Tuple2.of(false, null);
+            } else {
+                return Tuple2.of(true, parser.getLastResult());
+            }
+        }
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvReader.java
new file mode 100644
index 000000000..8ef910bed
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvReader.java
@@ -0,0 +1,44 @@
+package com.alibaba.alink.operator.common.dataproc.format;
+
+import com.alibaba.alink.operator.common.io.csv.CsvParser;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.table.api.TableSchema;
+import org.apache.flink.types.Row;
+
+import java.util.Map;
+
+/**
+ * Reads a CSV-formatted string column into a column-name to string-value map.
+ */
+public class CsvReader extends FormatReader {
+
+    private CsvParser parser;
+    private int csvColIndex;
+    private String[] colNames;
+
+    /**
+     * @param csvColIndex position of the CSV text column in the input row.
+     * @param schema      names and types of the fields inside the CSV text.
+     * @param fieldDelim  field delimiter handed to the CSV parser.
+     * @param quoteChar   quote character handed to the CSV parser.
+     */
+    public CsvReader(int csvColIndex, TableSchema schema, String fieldDelim, Character quoteChar) {
+        this.parser = new CsvParser(schema.getFieldTypes(), fieldDelim, quoteChar);
+
+        this.csvColIndex = csvColIndex;
+        this.colNames = schema.getFieldNames();
+    }
+
+    @Override
+    boolean read(Row row, Map<String, String> out) {
+        String line = (String) row.getField(csvColIndex);
+        Tuple2<Boolean, Row> parsed = parser.parse(line);
+
+        for (int i = 0; i < parsed.f1.getArity(); i++) {
+            Object field = parsed.f1.getField(i);
+            // Null fields keep their key, mapped to a null value.
+            out.put(colNames[i], null == field ? null : String.valueOf(field));
+        }
+        return parsed.f0;
+    }
+}
diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvWriter.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvWriter.java
new file mode 100644
index 000000000..96ef1c635
--- /dev/null
+++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/CsvWriter.java
@@ -0,0 +1,57 @@
+package com.alibaba.alink.operator.common.dataproc.format;
+
+import 
org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.types.Row; + +import java.util.Map; + +public class CsvWriter extends FormatWriter { + + private final String fieldDelim; + private final String quoteString; + private final String escapedQuote; + private final boolean enableQuote; + final String[] colNames; + + public CsvWriter(TableSchema schema, String fieldDelim, Character quoteChar) { + + this.colNames = schema.getFieldNames(); + this.fieldDelim = fieldDelim; + this.enableQuote = quoteChar != null; + if (enableQuote) { + this.quoteString = quoteChar.toString(); + this.escapedQuote = this.quoteString + this.quoteString; + } else { + this.quoteString = null; + this.escapedQuote = null; + } + } + + @Override + public Tuple2 write(Map in) { + StringBuilder sbd = new StringBuilder(); + for (int i = 0; i < colNames.length; i++) { + if (i > 0) { + sbd.append(fieldDelim); + } + String v = in.get(colNames[i]); + if (v == null) { + continue; + } + if (quoteString != null) { + if (v.isEmpty() || v.contains(fieldDelim) || v.contains(quoteString)) { + sbd.append(quoteString); + sbd.append(v.replace(quoteString, this.escapedQuote + quoteString)); + sbd.append(quoteString); + } else { + sbd.append(v); + } + } else { + sbd.append(v); + } + } + + return new Tuple2 <>(true, Row.of(sbd.toString())); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatReader.java new file mode 100644 index 000000000..6875d1275 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/FormatReader.java @@ -0,0 +1,11 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.types.Row; + +import java.io.Serializable; +import java.util.Map; + +abstract class FormatReader implements Serializable { + + abstract boolean read(Row row, Map out); +} diff 
/**
 * Mapper that converts one data format into another, row by row.
 *
 * <p>The source and target formats are taken from {@code FormatTransParams.FROM_FORMAT} and
 * {@code FormatTransParams.TO_FORMAT}. Each row is first decoded by a {@link FormatReader} into
 * an intermediate map and then encoded by a {@link FormatWriter}; the written fields are merged
 * with the input row by an {@link OutputColsHelper}.
 */
public class FormatTransMapper extends Mapper {

    private OutputColsHelper outputColsHelper;
    private HasHandleInvalidDefaultAsError.HandleInvalid handleInvalid;
    // Reader and writer are transient; open() builds them again from the params.
    private transient FormatReader formatReader;
    private transient FormatWriter formatWriter;

    /**
     * Constructor.
     *
     * <p>Builds the reader and the writer once in order to derive the output column names and
     * types, and reads the handle-invalid strategy and the reserved columns from the params.
     *
     * @param dataSchema the dataSchema.
     * @param params     the params.
     */
    public FormatTransMapper(TableSchema dataSchema, Params params) {
        super(dataSchema, params);

        Tuple2 t2From = initFormatReader(dataSchema, params);
        this.formatReader = t2From.f0;
        String[] fromColNames = t2From.f1;

        Tuple3 t3To = initFormatWriter(params, fromColNames);
        formatWriter = t3To.f0;
        String[] outputColNames = t3To.f1;
        TypeInformation[] outputColTypes = t3To.f2;

        this.handleInvalid = params.get(HasHandleInvalidDefaultAsError.HANDLE_INVALID);
        this.outputColsHelper = new OutputColsHelper(dataSchema, outputColNames, outputColTypes,
            this.params.get(HasReservedColsDefaultAsNull.RESERVED_COLS));
    }

    /**
     * Re-creates the reader and the writer from the data schema and the params.
     */
    @Override
    public void open() {
        Tuple2 t2From = initFormatReader(super.getDataSchema(), params);
        this.formatReader = t2From.f0;
        String[] fromColNames = t2From.f1;

        Tuple3 t3To = initFormatWriter(params, fromColNames);
        formatWriter = t3To.f0;
    }

    /**
     * Creates the reader for the source format configured in {@code FormatTransParams.FROM_FORMAT}.
     *
     * @param dataSchema the schema of the input rows, used to locate the source column(s).
     * @param params     the params holding the source format and its format-specific settings.
     * @return the reader (f0) and the source column names (f1); the names are only set for CSV
     *         (names of the CSV schema) and COLUMNS (selected or all columns), otherwise null.
     * @throws IllegalArgumentException if the source format is not handled by the switch below.
     */
    public static Tuple2 initFormatReader(TableSchema dataSchema, Params params) {
        FormatReader formatReader;
        String[] fromColNames;

        FormatType fromFormat = params.get(FormatTransParams.FROM_FORMAT);
        switch (fromFormat) {
            case KV:
                String kvColName = params.get(FromKvParams.KV_COL);
                int kvColIndex = TableUtil.findColIndexWithAssertAndHint(dataSchema.getFieldNames(), kvColName);
                formatReader = new KvReader(
                    kvColIndex,
                    params.get(FromKvParams.KV_COL_DELIMITER),
                    params.get(FromKvParams.KV_VAL_DELIMITER)
                );
                fromColNames = null;
                break;
            case CSV:
                String csvColName = params.get(FromCsvParams.CSV_COL);
                int csvColIndex = TableUtil.findColIndexWithAssertAndHint(dataSchema.getFieldNames(), csvColName);
                TableSchema fromCsvSchema = CsvUtil.schemaStr2Schema(params.get(FromCsvParams.SCHEMA_STR));
                formatReader = new CsvReader(
                    csvColIndex,
                    fromCsvSchema,
                    params.get(FromCsvParams.CSV_FIELD_DELIMITER),
                    params.get(FromCsvParams.QUOTE_CHAR)
                );
                fromColNames = fromCsvSchema.getFieldNames();
                break;
            case VECTOR:
                String vectorColName = params.get(FromVectorParams.VECTOR_COL);
                int vectorColIndex = TableUtil.findColIndexWithAssertAndHint(dataSchema.getFieldNames(),
                    vectorColName);
                // The schema is optional for vectors; without it the reader receives null.
                if (params.contains(HasSchemaStr.SCHEMA_STR)) {
                    formatReader = new VectorReader(
                        vectorColIndex,
                        CsvUtil.schemaStr2Schema(params.get(HasSchemaStr.SCHEMA_STR))
                    );
                } else {
                    formatReader = new VectorReader(vectorColIndex, null);
                }
                fromColNames = null;
                break;
            case JSON:
                String jsonColName = params.get(FromJsonParams.JSON_COL);
                int jsonColIndex = TableUtil.findColIndexWithAssertAndHint(dataSchema.getFieldNames(), jsonColName);
                formatReader = new JsonReader(jsonColIndex);
                fromColNames = null;
                break;
            case COLUMNS:
                fromColNames = params.get(FromColumnsParams.SELECTED_COLS);
                // No explicit selection means that every input column is used.
                if (null == fromColNames) {
                    fromColNames = dataSchema.getFieldNames();
                }
                int[] colIndices = TableUtil.findColIndicesWithAssertAndHint(dataSchema.getFieldNames(), fromColNames);
                formatReader = new ColumnsReader(colIndices, fromColNames);
                break;
            default:
                throw new IllegalArgumentException("Can not translate this type : " + fromFormat);
        }

        return new Tuple2<>(formatReader, fromColNames);
    }

    /**
     * Creates the writer for the target format configured in {@code FormatTransParams.TO_FORMAT}.
     *
     * @param params       the params holding the target format and its format-specific settings.
     * @param fromColNames the source column names from {@link #initFormatReader}; only passed
     *                     on to the VECTOR writer, may be null.
     * @return the writer (f0), the output column names (f1) and the output column types (f2).
     *         For COLUMNS these come from the target schema; every other format yields a single
     *         string column.
     * @throws IllegalArgumentException if the target format is not handled by the switch below.
     */
    public static Tuple3 initFormatWriter(Params params,
                                          String[] fromColNames) {
        FormatType toFormat = params.get(FormatTransParams.TO_FORMAT);
        FormatWriter formatWriter;
        String[] outputColNames;
        TypeInformation[] outputColTypes;

        switch (toFormat) {
            case COLUMNS:
                TableSchema schema = CsvUtil.schemaStr2Schema(params.get(ToColumnsParams.SCHEMA_STR));
                formatWriter = new ColumnsWriter(schema);
                outputColNames = schema.getFieldNames();
                outputColTypes = schema.getFieldTypes();
                break;
            case JSON:
                formatWriter = new JsonWriter();
                outputColNames = new String[]{params.get(ToJsonParams.JSON_COL)};
                outputColTypes = new TypeInformation[]{Types.STRING};
                break;
            case KV:
                formatWriter = new KvWriter(
                    params.get(ToKvParams.KV_COL_DELIMITER),
                    params.get(ToKvParams.KV_VAL_DELIMITER)
                );
                outputColNames = new String[]{params.get(ToKvParams.KV_COL)};
                outputColTypes = new TypeInformation[]{Types.STRING};
                break;
            case CSV:
                formatWriter = new CsvWriter(
                    CsvUtil.schemaStr2Schema(params.get(ToCsvParams.SCHEMA_STR)),
                    params.get(ToCsvParams.CSV_FIELD_DELIMITER),
                    params.get(ToCsvParams.QUOTE_CHAR)
                );
                outputColNames = new String[]{params.get(ToCsvParams.CSV_COL)};
                outputColTypes = new TypeInformation[]{Types.STRING};
                break;
            case VECTOR:
                formatWriter = new VectorWriter(
                    params.get(ToVectorParams.VECTOR_SIZE),
                    fromColNames
                );
                outputColNames = new String[]{params.get(ToVectorParams.VECTOR_COL)};
                outputColTypes = new TypeInformation[]{Types.STRING};
                break;
            default:
                throw new IllegalArgumentException("Can not translate to this type : " + toFormat);
        }

        return new Tuple3<>(formatWriter, outputColNames, outputColTypes);

    }

    /**
     * Converts one row from the source format to the target format.
     *
     * <p>The row is decoded into a fresh map by the reader and the map is encoded by the
     * writer. A failure in either step raises an exception only when the handle-invalid
     * strategy is {@code ERROR}; otherwise processing continues with whatever was produced.
     *
     * @param row the input Row type data
     * @return the input row merged with the written fields, or null for a null input row.
     * @throws RuntimeException if reading or writing fails and the strategy is {@code ERROR}.
     */
    @Override
    public Row map(Row row) {
        if (null == row) {
            return null;
        }
        Map bufMap = new HashMap<>();
        boolean success = formatReader.read(row, bufMap);
        if (!success && handleInvalid.equals(HasHandleInvalidDefaultAsError.HandleInvalid.ERROR)) {
            throw new RuntimeException("Fail to read: " + row);
        }
        // Reached also after a failed read under a non-ERROR strategy; the writer then
        // receives whatever the reader managed to put into the map.
        Tuple2 result = formatWriter.write(bufMap);
        if (!result.f0 && handleInvalid.equals(HasHandleInvalidDefaultAsError.HandleInvalid.ERROR)) {
            throw new RuntimeException("Fail to write: " + JsonConverter.toJson(bufMap));
        }
        // Vector to columns only: fields the writer left null are replaced by 0.0.
        // NOTE(review): 0.0 is set whatever the column type is; presumably the target columns
        // are expected to be double-typed - confirm.
        if (params.get(FormatTransParams.FROM_FORMAT).equals(FormatType.VECTOR) &&
            params.get(FormatTransParams.TO_FORMAT).equals(FormatType.COLUMNS)) {
            int length = result.f1.getArity();
            for (int i = 0; i < length; i++) {
                if (result.f1.getField(i) == null) {
                    result.f1.setField(i, 0.0);
                }
            }
        }
        return outputColsHelper.getResultRow(row, result.f1);
    }

    /**
     * Get the output data schema.
     *
     * @return the result schema computed by the output-columns helper.
     */
    @Override
    public TableSchema getOutputSchema() {
        return outputColsHelper.getResultSchema();
    }
}
org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.types.Row; + +import java.io.Serializable; +import java.util.Map; + +public abstract class FormatWriter implements Serializable { + + public abstract Tuple2 write(Map in); + +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonReader.java new file mode 100644 index 000000000..6b7f0da73 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonReader.java @@ -0,0 +1,29 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.types.Row; + +import com.alibaba.alink.common.utils.JsonConverter; + +import java.util.Map; + +public class JsonReader extends FormatReader { + + final int jsonColIndex; + + public JsonReader(int jsonColIndex) { + this.jsonColIndex = jsonColIndex; + } + + @Override + boolean read(Row row, Map out) { + String line = (String) row.getField(jsonColIndex); + + Map map = JsonConverter.fromJson(line, Map.class); + + map.forEach((key, value) -> { + out.put(key.toString(), value.toString()); + + }); + return true; + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonWriter.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonWriter.java new file mode 100644 index 000000000..607617d4f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/JsonWriter.java @@ -0,0 +1,18 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.types.Row; + +import com.alibaba.alink.common.utils.JsonConverter; + +import java.util.Map; + +public class JsonWriter extends FormatWriter { + + public JsonWriter() {} + + @Override + public Tuple2 write(Map in) { + return new Tuple2 <>(true, Row.of(JsonConverter.toJson(in))); + } +} diff --git 
a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvReader.java new file mode 100644 index 000000000..380270278 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvReader.java @@ -0,0 +1,38 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.types.Row; +import org.apache.flink.util.StringUtils; + +import java.util.Map; + +public class KvReader extends FormatReader { + + final int kvColIndex; + final String colDelimiter; + final String valDelimiter; + + public KvReader(int kvColIndex, String colDelimiter, String valDelimiter) { + this.kvColIndex = kvColIndex; + this.colDelimiter = colDelimiter; + this.valDelimiter = valDelimiter; + } + + @Override + boolean read(Row row, Map out) { + String line = (String) row.getField(kvColIndex); + String[] fields = line.split(colDelimiter); + + for (int i = 0; i < fields.length; i++) { + if (StringUtils.isNullOrWhitespaceOnly(fields[i])) { + return false; + } + String[] kv = fields[i].split(valDelimiter); + if (kv.length != 2) { + return false; + } + out.put(kv[0], kv[1]); + } + + return true; + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvWriter.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvWriter.java new file mode 100644 index 000000000..b99abcfb3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/KvWriter.java @@ -0,0 +1,33 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.types.Row; + +import java.util.Map; + +public class KvWriter extends FormatWriter { + + final String colDelimiter; + final String valDelimiter; + + public KvWriter(String colDelimiter, String valDelimiter) { + this.colDelimiter = colDelimiter; + this.valDelimiter = valDelimiter; 
+ } + + @Override + public Tuple2 write(Map in) { + StringBuilder sbd = new StringBuilder(); + boolean isFirstPair = true; + for (Map.Entry entry : in.entrySet()) { + if (isFirstPair) { + isFirstPair = false; + } else { + sbd.append(colDelimiter); + } + sbd.append(entry.getKey() + valDelimiter + entry.getValue()); + } + + return new Tuple2 <>(true, Row.of(sbd.toString())); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorReader.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorReader.java new file mode 100644 index 000000000..584a8def1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorReader.java @@ -0,0 +1,59 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import org.apache.flink.table.api.TableSchema; +import org.apache.flink.types.Row; + +import com.alibaba.alink.common.linalg.DenseVector; +import com.alibaba.alink.common.linalg.SparseVector; +import com.alibaba.alink.common.linalg.Vector; +import com.alibaba.alink.common.linalg.VectorUtil; + +import java.util.Map; + +public class VectorReader extends FormatReader { + + final int vecColIndex; + final String[] colNames; + + public VectorReader(int vecColIndex, TableSchema schema) { + this.vecColIndex = vecColIndex; + if (null == schema) { + this.colNames = null; + } else { + this.colNames = schema.getFieldNames(); + } + } + + @Override + boolean read(Row row, Map out) { + Vector vec = VectorUtil.getVector(row.getField(vecColIndex)); + if (vec instanceof DenseVector) { + DenseVector denseVector = (DenseVector) vec; + if (null == colNames) { + for (int i = 0; i < denseVector.size(); i++) { + out.put(String.valueOf(i), String.valueOf(denseVector.get(i))); + } + } else { + int nCol = Math.min(colNames.length, denseVector.size()); + for (int i = 0; i < denseVector.size(); i++) { + out.put(colNames[i], String.valueOf(denseVector.get(i))); + } + } + } else { + SparseVector 
sparseVector = (SparseVector) vec; + if (null == colNames) { + for (int i : sparseVector.getIndices()) { + out.put(String.valueOf(i), String.valueOf(sparseVector.get(i))); + } + } else { + for (int i : sparseVector.getIndices()) { + if (i < colNames.length) { + out.put(colNames[i], String.valueOf(sparseVector.get(i))); + } + } + } + + } + return true; + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorWriter.java b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorWriter.java new file mode 100644 index 000000000..e1f9bd18d --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/common/dataproc/format/VectorWriter.java @@ -0,0 +1,72 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import com.alibaba.alink.common.linalg.SparseVector; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.types.Row; + +import java.util.Map; + +public class VectorWriter extends FormatWriter { + + final long size; + final String[] colNames; + + public VectorWriter(long size, String[] colNames) { + this.size = size; + this.colNames = colNames; + } + + @Override + public Tuple2 write(Map in) { + if (null == this.colNames) { +// StringBuilder sbd = new StringBuilder(); +// +// if (this.size > 0) { +// sbd.append("$").append(this.size).append("$"); +// } +// +// boolean isFirstPair = true; +// for (Map.Entry entry : in.entrySet()) { +// if (isFirstPair) { +// isFirstPair = false; +// } else { +// sbd.append(" "); +// } +// sbd.append(entry.getKey() + ":" + entry.getValue()); +// } + + int itemSize = in.size(); + int[] indices = new int[itemSize]; + double[] values = new double[itemSize]; + int count = 0; + for (Map.Entry entry : in.entrySet()) { + indices[count] = Integer.parseInt(entry.getKey()); + values[count] = Double.parseDouble(entry.getValue()); + count++; + } + + return new Tuple2 <>(true, Row.of(new SparseVector((int) this.size, indices, values).toString())); + + } else 
{ + StringBuilder sbd = new StringBuilder(); + + int n = colNames.length; + if (this.size > colNames.length ) { + sbd.append("$").append(this.size).append("$"); + }else if(this.size > 0 && this.size 0) { + sbd.append(" "); + } + String v = in.get(colNames[i]); + sbd.append(v); + } + + return new Tuple2 <>(true, Row.of(sbd.toString())); + + } + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/AnyToTripleStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/AnyToTripleStreamOp.java new file mode 100644 index 000000000..6e8f1d41c --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/AnyToTripleStreamOp.java @@ -0,0 +1,27 @@ +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.AnyToTripleFlatMapper; +import com.alibaba.alink.operator.common.dataproc.format.FormatTransParams; +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.operator.stream.utils.FlatMapStreamOp; +import com.alibaba.alink.params.dataproc.format.ToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class AnyToTripleStreamOp> extends FlatMapStreamOp + implements ToTripleParams { + + public AnyToTripleStreamOp() { + this(null); + } + + public AnyToTripleStreamOp(FormatType formatType, Params params) { + this( + (null == params ? 
/**
 * Base stream operator for format transformation.
 *
 * <p>The public constructor stores the source and target {@link FormatType} in the params under
 * {@code FormatTransParams.FROM_FORMAT} and {@code FormatTransParams.TO_FORMAT}; the per-row
 * work is delegated to {@link FormatTransMapper}.
 */
// NOTE(review): the type-parameter list of the class header looks truncated in this copy.
public class BaseFormatTransStreamOp> extends MapStreamOp {

    // Not callable from outside; forwards null params to the private constructor below.
    private BaseFormatTransStreamOp() {
        this(null);
    }

    /**
     * @param fromFormat the source format.
     * @param toFormat   the target format.
     * @param params     the operator params, or null to start from empty params. A non-null
     *                   instance is the one the two formats are set on.
     */
    public BaseFormatTransStreamOp(FormatType fromFormat, FormatType toFormat, Params params) {
        this(
            (null == params ? new Params() : params)
                .set(FormatTransParams.FROM_FORMAT, fromFormat)
                .set(FormatTransParams.TO_FORMAT, toFormat)
        );
    }

    // Wires the operator to FormatTransMapper; expects the formats to be present in params.
    private BaseFormatTransStreamOp(Params params) {
        super(FormatTransMapper::new, params);
    }
}
/**
 * Format-transform stream operator configured with {@code FormatType.COLUMNS} as the source
 * format and {@code FormatType.KV} as the target format.
 */
public class ColumnsToKvStreamOp extends BaseFormatTransStreamOp
    implements ColumnsToKvParams {

    /** Creates the operator with empty params. */
    public ColumnsToKvStreamOp() {
        this(new Params());
    }

    /**
     * @param params the operator params; the source and target formats are added by the
     *               base class.
     */
    public ColumnsToKvStreamOp(Params params) {
        super(FormatType.COLUMNS, FormatType.KV, params);
    }
}
/**
 * Format-transform stream operator configured with {@code FormatType.CSV} as the source format
 * and {@code FormatType.COLUMNS} as the target format.
 */
public class CsvToColumnsStreamOp extends BaseFormatTransStreamOp
    implements CsvToColumnsParams {

    /** Creates the operator with empty params. */
    public CsvToColumnsStreamOp() {
        this(new Params());
    }

    /**
     * @param params the operator params; the source and target formats are added by the
     *               base class.
     */
    public CsvToColumnsStreamOp(Params params) {
        super(FormatType.CSV, FormatType.COLUMNS, params);
    }
}
/dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToJsonStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToJsonStreamOp extends BaseFormatTransStreamOp + implements CsvToJsonParams { + + public CsvToJsonStreamOp() { + this(new Params()); + } + + public CsvToJsonStreamOp(Params params) { + super(FormatType.CSV, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToKvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToKvStreamOp.java new file mode 100644 index 000000000..8ab4c5b65 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToKvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToKvStreamOp extends BaseFormatTransStreamOp + implements CsvToKvParams { + + public CsvToKvStreamOp() { + this(new Params()); + } + + public CsvToKvStreamOp(Params params) { + super(FormatType.CSV, FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToTripleStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToTripleStreamOp.java new file mode 100644 index 000000000..607c10fee --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToTripleStreamOp.java @@ -0,0 +1,16 @@ +package com.alibaba.alink.operator.stream.dataproc.format; + +import 
com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToTripleStreamOp extends AnyToTripleStreamOp + implements CsvToTripleParams { + public CsvToTripleStreamOp() { + this(new Params()); + } + + public CsvToTripleStreamOp(Params params) { + super(FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToVectorStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToVectorStreamOp.java new file mode 100644 index 000000000..635852e73 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/CsvToVectorStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToVectorStreamOp extends BaseFormatTransStreamOp + implements CsvToVectorParams { + + public CsvToVectorStreamOp() { + this(new Params()); + } + + public CsvToVectorStreamOp(Params params) { + super(FormatType.CSV, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToColumnsStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToColumnsStreamOp.java new file mode 100644 index 000000000..1a4b76650 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToColumnsStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class 
JsonToColumnsStreamOp extends BaseFormatTransStreamOp + implements JsonToColumnsParams { + + public JsonToColumnsStreamOp() { + this(new Params()); + } + + public JsonToColumnsStreamOp(Params params) { + super(FormatType.JSON, FormatType.COLUMNS, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToCsvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToCsvStreamOp.java new file mode 100644 index 000000000..e10cc380b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToCsvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToCsvStreamOp extends BaseFormatTransStreamOp + implements JsonToCsvParams { + + public JsonToCsvStreamOp() { + this(new Params()); + } + + public JsonToCsvStreamOp(Params params) { + super(FormatType.JSON, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToKvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToKvStreamOp.java new file mode 100644 index 000000000..b243c06f4 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToKvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToKvStreamOp extends BaseFormatTransStreamOp + implements JsonToKvParams { + + public JsonToKvStreamOp() { + this(new Params()); + } + + public JsonToKvStreamOp(Params params) { + super(FormatType.JSON, 
FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToTripleStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToTripleStreamOp.java new file mode 100644 index 000000000..8652b27d1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToTripleStreamOp.java @@ -0,0 +1,16 @@ +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToTripleStreamOp extends AnyToTripleStreamOp + implements JsonToTripleParams { + public JsonToTripleStreamOp() { + this(new Params()); + } + + public JsonToTripleStreamOp(Params params) { + super(FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToVectorStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToVectorStreamOp.java new file mode 100644 index 000000000..d680c4566 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/JsonToVectorStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToVectorStreamOp extends BaseFormatTransStreamOp + implements JsonToVectorParams { + + public JsonToVectorStreamOp() { + this(new Params()); + } + + public JsonToVectorStreamOp(Params params) { + super(FormatType.JSON, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToColumnsStreamOp.java 
b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToColumnsStreamOp.java new file mode 100644 index 000000000..7fbdb8f0b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToColumnsStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToColumnsStreamOp extends BaseFormatTransStreamOp + implements KvToColumnsParams { + + public KvToColumnsStreamOp() { + this(new Params()); + } + + public KvToColumnsStreamOp(Params params) { + super(FormatType.KV, FormatType.COLUMNS, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToCsvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToCsvStreamOp.java new file mode 100644 index 000000000..998a79527 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToCsvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToCsvStreamOp extends BaseFormatTransStreamOp + implements KvToCsvParams { + + public KvToCsvStreamOp() { + this(new Params()); + } + + public KvToCsvStreamOp(Params params) { + super(FormatType.KV, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToJsonStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToJsonStreamOp.java new file mode 100644 index 000000000..05f05788b --- /dev/null +++ 
b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToJsonStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToJsonStreamOp extends BaseFormatTransStreamOp + implements KvToJsonParams { + + public KvToJsonStreamOp() { + this(new Params()); + } + + public KvToJsonStreamOp(Params params) { + super(FormatType.KV, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToTripleStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToTripleStreamOp.java new file mode 100644 index 000000000..0479b79bd --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToTripleStreamOp.java @@ -0,0 +1,16 @@ +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToTripleStreamOp extends AnyToTripleStreamOp + implements KvToTripleParams { + public KvToTripleStreamOp() { + this(new Params()); + } + + public KvToTripleStreamOp(Params params) { + super(FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToVectorStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToVectorStreamOp.java new file mode 100644 index 000000000..1fb91a3ac --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/KvToVectorStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; 
+import com.alibaba.alink.params.dataproc.format.KvToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class KvToVectorStreamOp extends BaseFormatTransStreamOp + implements KvToVectorParams { + + public KvToVectorStreamOp() { + this(new Params()); + } + + public KvToVectorStreamOp(Params params) { + super(FormatType.KV, FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToColumnsStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToColumnsStreamOp.java new file mode 100644 index 000000000..eb2ce974a --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToColumnsStreamOp.java @@ -0,0 +1,20 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; + +import com.alibaba.alink.params.dataproc.format.VectorToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToColumnsStreamOp extends BaseFormatTransStreamOp + implements VectorToColumnsParams { + + public VectorToColumnsStreamOp() { + this(new Params()); + } + + public VectorToColumnsStreamOp(Params params) { + super(FormatType.VECTOR, FormatType.COLUMNS, params); + } + +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToCsvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToCsvStreamOp.java new file mode 100644 index 000000000..619c1cd8f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToCsvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToCsvStreamOp 
extends BaseFormatTransStreamOp + implements VectorToCsvParams { + + public VectorToCsvStreamOp() { + this(new Params()); + } + + public VectorToCsvStreamOp(Params params) { + super(FormatType.VECTOR, FormatType.CSV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToJsonStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToJsonStreamOp.java new file mode 100644 index 000000000..753bbb5c9 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToJsonStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToJsonStreamOp extends BaseFormatTransStreamOp + implements VectorToJsonParams { + + public VectorToJsonStreamOp() { + this(new Params()); + } + + public VectorToJsonStreamOp(Params params) { + super(FormatType.VECTOR, FormatType.JSON, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToKvStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToKvStreamOp.java new file mode 100644 index 000000000..01ea79a27 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToKvStreamOp.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToKvStreamOp extends BaseFormatTransStreamOp + implements VectorToKvParams { + + public VectorToKvStreamOp() { + this(new Params()); + } + + public VectorToKvStreamOp(Params params) { + 
super(FormatType.VECTOR, FormatType.KV, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToTripleStreamOp.java b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToTripleStreamOp.java new file mode 100644 index 000000000..76bbe84f3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/operator/stream/dataproc/format/VectorToTripleStreamOp.java @@ -0,0 +1,16 @@ +package com.alibaba.alink.operator.stream.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class VectorToTripleStreamOp extends AnyToTripleStreamOp + implements VectorToTripleParams { + public VectorToTripleStreamOp() { + this(new Params()); + } + + public VectorToTripleStreamOp(Params params) { + super(FormatType.VECTOR, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToCsvParams.java new file mode 100644 index 000000000..687f14864 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToCsvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface ColumnsToCsvParams extends + ToCsvParams, + FromColumnsParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToJsonParams.java new file mode 100644 index 000000000..3609efb62 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToJsonParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface ColumnsToJsonParams extends + ToJsonParams, + FromColumnsParams, + 
HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToKvParams.java new file mode 100644 index 000000000..34cd35264 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToKvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface ColumnsToKvParams extends + ToKvParams, + FromColumnsParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToTripleParams.java new file mode 100644 index 000000000..75a3b2350 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToTripleParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface ColumnsToTripleParams extends + FromColumnsParams , + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToVectorParams.java new file mode 100644 index 000000000..4ef9e0ca3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ColumnsToVectorParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface ColumnsToVectorParams extends + ToVectorParams, + FromColumnsParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToColumnsParams.java new file mode 100644 index 000000000..fd25067c5 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToColumnsParams.java @@ -0,0 +1,8 @@ + +package 
com.alibaba.alink.params.dataproc.format; + +public interface CsvToColumnsParams extends + ToColumnsParams, + FromCsvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToJsonParams.java new file mode 100644 index 000000000..f4a79f439 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToJsonParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface CsvToJsonParams extends + ToJsonParams, + FromCsvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToKvParams.java new file mode 100644 index 000000000..34541d7f3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToKvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface CsvToKvParams extends + ToKvParams, + FromCsvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToTripleParams.java new file mode 100644 index 000000000..a26bd2c36 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToTripleParams.java @@ -0,0 +1,6 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface CsvToTripleParams extends + FromCsvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToVectorParams.java new file mode 100644 index 000000000..ab83a1380 --- /dev/null +++ 
b/core/src/main/java/com/alibaba/alink/params/dataproc/format/CsvToVectorParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface CsvToVectorParams extends + ToVectorParams, + FromCsvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromColumnsParams.java new file mode 100644 index 000000000..c22ed7387 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromColumnsParams.java @@ -0,0 +1,7 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.shared.colname.HasSelectedColsDefaultAsNull; + +public interface FromColumnsParams extends + HasSelectedColsDefaultAsNull { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromCsvParams.java new file mode 100644 index 000000000..7c2e838d5 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromCsvParams.java @@ -0,0 +1,11 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.io.HasQuoteCharDefaultAsDoubleQuote; +import com.alibaba.alink.params.io.HasSchemaStr; + +public interface FromCsvParams extends + HasCsvCol , + HasSchemaStr , + HasCsvFieldDelimiterDefaultAsComma , + HasQuoteCharDefaultAsDoubleQuote { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromJsonParams.java new file mode 100644 index 000000000..f9343d61c --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromJsonParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface FromJsonParams extends + HasJsonCol { 
+} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromKvParams.java new file mode 100644 index 000000000..616d6b1bc --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromKvParams.java @@ -0,0 +1,7 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface FromKvParams extends + HasKvCol , + HasKvColDelimiterDefaultAsComma , + HasKvValDelimiterDefaultAsColon { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromTripleParams.java new file mode 100644 index 000000000..edeae2856 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromTripleParams.java @@ -0,0 +1,46 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; + +public interface FromTripleParams extends + HasTripleRowCol { + + /** + * @cn-name 三元组结构中列信息的列名 + * @cn 三元组结构中列信息的列名 + */ + ParamInfo TRIPLE_COL_COL = ParamInfoFactory + .createParamInfo("tripleColCol", String.class) + .setDescription("Name of the column holding the column information of the triple") + .setAlias(new String[]{"tripleCol"}) + .setRequired() + .build(); + + default String getTripleColCol() { + return get(TRIPLE_COL_COL); + } + + default T setTripleColCol(String colName) { + return set(TRIPLE_COL_COL, colName); + } + + /** + * @cn-name 三元组结构中数据信息的列名 + * @cn 三元组结构中数据信息的列名 + */ + ParamInfo TRIPLE_VAL_COL = ParamInfoFactory + .createParamInfo("tripleValCol", String.class) + .setDescription("Name of the column holding the value information of the triple") + .setAlias(new String[]{"tripleVal"}) + .setRequired() + .build(); + + default String getTripleValCol() { + return get(TRIPLE_VAL_COL); + } + + default T setTripleValCol(String colName) { + return
set(TRIPLE_VAL_COL, colName); + } +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromVectorParams.java new file mode 100644 index 000000000..aae4c1879 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/FromVectorParams.java @@ -0,0 +1,7 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.shared.colname.HasVectorCol; + +public interface FromVectorParams extends + HasVectorCol { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvCol.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvCol.java new file mode 100644 index 000000000..b55cc00bb --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvCol.java @@ -0,0 +1,29 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * An interface for classes with a parameter specifying the name of the table column. 
+ */ +public interface HasCsvCol extends WithParams { + + /** + * @cn-name CSV列名 + * @cn CSV列的列名 + */ + ParamInfo CSV_COL = ParamInfoFactory + .createParamInfo("csvCol", String.class) + .setDescription("Name of the CSV column") + .setRequired() + .build(); + + default String getCsvCol() { + return get(CSV_COL); + } + + default T setCsvCol(String colName) { + return set(CSV_COL, colName); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvFieldDelimiterDefaultAsComma.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvFieldDelimiterDefaultAsComma.java new file mode 100644 index 000000000..eef56437b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasCsvFieldDelimiterDefaultAsComma.java @@ -0,0 +1,25 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface HasCsvFieldDelimiterDefaultAsComma extends WithParams { + /** + * @cn-name 字段分隔符 + * @cn 字段分隔符 + */ + ParamInfo CSV_FIELD_DELIMITER = ParamInfoFactory + .createParamInfo("csvFieldDelimiter", String.class) + .setDescription("Field delimiter") + .setHasDefaultValue(",") + .build(); + + default String getCsvFieldDelimiter() { + return get(CSV_FIELD_DELIMITER); + } + + default T setCsvFieldDelimiter(String value) { + return set(CSV_FIELD_DELIMITER, value); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsError.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsError.java new file mode 100644 index 000000000..e21692ed7 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsError.java @@ -0,0 +1,44 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.ParamUtil; +import 
org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface HasHandleInvalidDefaultAsError extends WithParams { + /** + * @cn-name 解析异常处理策略 + * @cn 解析异常处理策略 + */ + ParamInfo HANDLE_INVALID = ParamInfoFactory + .createParamInfo("handleInvalid", HandleInvalid.class) + .setDescription("Strategy to handle unseen token") + .setHasDefaultValue(HandleInvalid.ERROR) + .build(); + + default HandleInvalid getHandleInvalid() { + return get(HANDLE_INVALID); + } + + default T setHandleInvalid(HandleInvalid value) { + return set(HANDLE_INVALID, value); + } + + default T setHandleInvalid(String value) { + return set(HANDLE_INVALID, ParamUtil.searchEnum(HANDLE_INVALID, value)); + } + + /** + * Strategy to handle unseen token when doing prediction. + */ + enum HandleInvalid { + /** + * Raise exception. + */ + ERROR, + /** + * Pad with null. + */ + SKIP + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsSkip.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsSkip.java new file mode 100644 index 000000000..580fe050f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasHandleInvalidDefaultAsSkip.java @@ -0,0 +1,44 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.ParamUtil; +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface HasHandleInvalidDefaultAsSkip extends WithParams { + /** + * @cn-name 解析异常处理策略 + * @cn 解析异常处理策略 + */ + ParamInfo HANDLE_INVALID = ParamInfoFactory + .createParamInfo("handleInvalid", HandleInvalid.class) + .setDescription("Strategy to handle unseen token") + .setHasDefaultValue(HandleInvalid.ERROR) + .build(); + + default HandleInvalid 
getHandleInvalid() { + return get(HANDLE_INVALID); + } + + default T setHandleInvalid(HandleInvalid value) { + return set(HANDLE_INVALID, value); + } + + default T setHandleInvalid(String value) { + return set(HANDLE_INVALID, ParamUtil.searchEnum(HANDLE_INVALID, value)); + } + + /** + * Strategy to handle unseen token when doing prediction. + */ + enum HandleInvalid { + /** + * Raise exception. + */ + ERROR, + /** + * Pad with null. + */ + SKIP + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasJsonCol.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasJsonCol.java new file mode 100644 index 000000000..c8611bb2c --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasJsonCol.java @@ -0,0 +1,29 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * An interface for classes with a parameter specifying the name of the table column. 
+ */ +public interface HasJsonCol extends WithParams { + + /** + * @cn-name JSON列名 + * @cn JSON列的列名 + */ + ParamInfo JSON_COL = ParamInfoFactory + .createParamInfo("jsonCol", String.class) + .setDescription("Name of the JSON column") + .setRequired() + .build(); + + default String getJsonCol() { + return get(JSON_COL); + } + + default T setJsonCol(String colName) { + return set(JSON_COL, colName); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvCol.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvCol.java new file mode 100644 index 000000000..2905b559f --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvCol.java @@ -0,0 +1,30 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * An interface for classes with a parameter specifying the name of the table column. 
+ */ +public interface HasKvCol extends WithParams { + + /** + * @cn-name KV列名 + * @cn KV列的列名 + */ + ParamInfo KV_COL = ParamInfoFactory + .createParamInfo("kvCol", String.class) + .setDescription("Name of the KV column") + .setRequired() + .setAlias(new String[]{"selectedCol", "selectedColName"}) + .build(); + + default String getKvCol() { + return get(KV_COL); + } + + default T setKvCol(String colName) { + return set(KV_COL, colName); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvColDelimiterDefaultAsComma.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvColDelimiterDefaultAsComma.java new file mode 100644 index 000000000..5d09bc9e4 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvColDelimiterDefaultAsComma.java @@ -0,0 +1,26 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface HasKvColDelimiterDefaultAsComma extends WithParams { + /** + * @cn-name 分隔符 + * @cn 当输入数据为稀疏格式时,key-value对之间的分隔符 + */ + ParamInfo KV_COL_DELIMITER = ParamInfoFactory + .createParamInfo("kvColDelimiter", String.class) + .setDescription("Delimiter used between key-value pairs when data in the input table is in sparse format") + .setAlias(new String[]{"colDelimiter"}) + .setHasDefaultValue(",") + .build(); + + default String getKvColDelimiter() { + return get(KV_COL_DELIMITER); + } + + default T setKvColDelimiter(String value) { + return set(KV_COL_DELIMITER, value); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvValDelimiterDefaultAsColon.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvValDelimiterDefaultAsColon.java new file mode 100644 index 000000000..a7385fead --- /dev/null +++ 
b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasKvValDelimiterDefaultAsColon.java @@ -0,0 +1,26 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface HasKvValDelimiterDefaultAsColon extends WithParams { + /** + * @cn-name 分隔符 + * @cn 当输入数据为稀疏格式时,key和value的分割符 + */ + ParamInfo KV_VAL_DELIMITER = ParamInfoFactory + .createParamInfo("kvValDelimiter", String.class) + .setDescription("Delimiter used between keys and values when data in the input table is in sparse format") + .setAlias(new String[]{"valDelimiter"}) + .setHasDefaultValue(":") + .build(); + + default String getKvValDelimiter() { + return get(KV_VAL_DELIMITER); + } + + default T setKvValDelimiter(String value) { + return set(KV_VAL_DELIMITER, value); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasTripleRowCol.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasTripleRowCol.java new file mode 100644 index 000000000..7c72b99fc --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasTripleRowCol.java @@ -0,0 +1,28 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * An interface for classes with a parameter specifying the name of the table column. 
+ */ +public interface HasTripleRowCol extends WithParams { + + /** + * @cn-name 三元组结构中行信息的列名 + * @cn 三元组结构中行信息的列名 + */ + ParamInfo TRIPLE_ROW_COL = ParamInfoFactory + .createParamInfo("tripleRowCol", String.class) + .setDescription("Name of the triple row column") + .build(); + + default String getTripleRowCol() { + return get(TRIPLE_ROW_COL); + } + + default T setTripleRowCol(String colName) { + return set(TRIPLE_ROW_COL, colName); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasVectorSize.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasVectorSize.java new file mode 100644 index 000000000..ec3cf75be --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/HasVectorSize.java @@ -0,0 +1,33 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * Trait for parameter vectorSize. 
+ */ +public interface HasVectorSize extends WithParams { + + /** + * @cn-name 向量长度 + * @cn 向量长度 + */ + ParamInfo VECTOR_SIZE = ParamInfoFactory + .createParamInfo("vectorSize", Long.class) + .setDescription("Size of the vector") + .setHasDefaultValue(-1L) + .build(); + + default Long getVectorSize() { + return get(VECTOR_SIZE); + } + + default T setVectorSize(Long size) { + return set(VECTOR_SIZE, size); + } + + default T setVectorSize(Integer size) { + return set(VECTOR_SIZE, size.longValue()); + } +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToColumnsParams.java new file mode 100644 index 000000000..1c54461e5 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToColumnsParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface JsonToColumnsParams extends + ToColumnsParams, + FromJsonParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToCsvParams.java new file mode 100644 index 000000000..7970c4f68 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToCsvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface JsonToCsvParams extends + ToCsvParams, + FromJsonParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToKvParams.java new file mode 100644 index 000000000..55c166e3c --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToKvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface JsonToKvParams extends + ToKvParams, + 
FromJsonParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToTripleParams.java new file mode 100644 index 000000000..2ee543bb1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToTripleParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface JsonToTripleParams extends + FromJsonParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToVectorParams.java new file mode 100644 index 000000000..2f089a7be --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/JsonToVectorParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface JsonToVectorParams extends + ToVectorParams, + FromJsonParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToColumnsParams.java new file mode 100644 index 000000000..ece287e2b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToColumnsParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface KvToColumnsParams extends + ToColumnsParams, + FromKvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToCsvParams.java new file mode 100644 index 000000000..b24687d5b --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToCsvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + 
+public interface KvToCsvParams extends + ToCsvParams, + FromKvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToJsonParams.java new file mode 100644 index 000000000..1d63a54d2 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToJsonParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface KvToJsonParams extends + ToJsonParams, + FromKvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToTripleParams.java new file mode 100644 index 000000000..fb61faed2 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToTripleParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface KvToTripleParams extends + FromKvParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToVectorParams.java new file mode 100644 index 000000000..a4661cb67 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/KvToVectorParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface KvToVectorParams extends + ToVectorParams, + FromKvParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToColumnsParams.java new file mode 100644 index 000000000..521bd08cf --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToColumnsParams.java @@ -0,0 +1,9 @@ +package 
com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.io.HasSchemaStr; +import com.alibaba.alink.params.shared.colname.HasReservedColsDefaultAsNull; + +public interface ToColumnsParams extends + HasReservedColsDefaultAsNull, + HasSchemaStr { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToCsvParams.java new file mode 100644 index 000000000..b9cfe9a11 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToCsvParams.java @@ -0,0 +1,13 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.io.HasQuoteCharDefaultAsDoubleQuote; +import com.alibaba.alink.params.io.HasSchemaStr; +import com.alibaba.alink.params.shared.colname.HasReservedColsDefaultAsNull; + +public interface ToCsvParams extends + HasReservedColsDefaultAsNull, + HasCsvCol , + HasSchemaStr , + HasCsvFieldDelimiterDefaultAsComma, + HasQuoteCharDefaultAsDoubleQuote { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToJsonParams.java new file mode 100644 index 000000000..ff2b2dcad --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToJsonParams.java @@ -0,0 +1,8 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.shared.colname.HasReservedColsDefaultAsNull; + +public interface ToJsonParams extends + HasReservedColsDefaultAsNull, + HasJsonCol { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToKvParams.java new file mode 100644 index 000000000..81c855094 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToKvParams.java @@ -0,0 +1,10 @@ 
+package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.shared.colname.HasReservedColsDefaultAsNull; + +public interface ToKvParams extends + HasReservedColsDefaultAsNull, + HasKvCol, + HasKvColDelimiterDefaultAsComma, + HasKvValDelimiterDefaultAsColon { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToTripleParams.java new file mode 100644 index 000000000..08bac2c2d --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToTripleParams.java @@ -0,0 +1,46 @@ +package com.alibaba.alink.params.dataproc.format; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +public interface ToTripleParams extends WithParams { + + /** + * @cn-name 三元组结构中列信息和数据信息的Schema + * @cn 三元组结构中列信息和数据信息的Schema + */ + ParamInfo TRIPLE_COL_VAL_SCHEMA_STR = ParamInfoFactory + .createParamInfo("tripleColValSchemaStr", String.class) + .setDescription("Schema string of the triple's col and val column") + .setRequired() + .build(); + + default String getTripleColValSchemaStr() { + return get(TRIPLE_COL_VAL_SCHEMA_STR); + } + + default T setTripleColValSchemaStr(String colName) { + return set(TRIPLE_COL_VAL_SCHEMA_STR, colName); + } + + /** + * @cn-name 算法保留列名 + * @cn 算法保留列 + */ + ParamInfo RESERVED_COLS = ParamInfoFactory + .createParamInfo("reservedCols", String[].class) + .setDescription("Names of the columns to be retained in the output table") + .setAlias(new String[] {"keepColNames"}) + .setHasDefaultValue(new String[0]) + .build(); + + default String[] getReservedCols() { + return get(RESERVED_COLS); + } + + default T setReservedCols(String... 
colNames) { + return set(RESERVED_COLS, colNames); + } + +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToVectorParams.java new file mode 100644 index 000000000..a148611e3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/ToVectorParams.java @@ -0,0 +1,10 @@ +package com.alibaba.alink.params.dataproc.format; + +import com.alibaba.alink.params.shared.colname.HasReservedColsDefaultAsNull; +import com.alibaba.alink.params.shared.colname.HasVectorCol; + +public interface ToVectorParams extends + HasReservedColsDefaultAsNull, + HasVectorCol , + HasVectorSize { +} \ No newline at end of file diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToColumnsParams.java new file mode 100644 index 000000000..c5358265e --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToColumnsParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface TripleToColumnsParams extends + ToColumnsParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToCsvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToCsvParams.java new file mode 100644 index 000000000..4b3be7d53 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToCsvParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface TripleToCsvParams extends + ToCsvParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToJsonParams.java new file mode 100644 index 
000000000..03bb8fe60 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToJsonParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface TripleToJsonParams extends + ToJsonParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToKvParams.java new file mode 100644 index 000000000..f8e300ef1 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToKvParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface TripleToKvParams extends + ToKvParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToVectorParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToVectorParams.java new file mode 100644 index 000000000..ae79f7f00 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/TripleToVectorParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface TripleToVectorParams extends + ToVectorParams, + HasHandleInvalidDefaultAsError {} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToColumnsParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToColumnsParams.java new file mode 100644 index 000000000..03f7969b3 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToColumnsParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface VectorToColumnsParams extends + ToColumnsParams, + FromVectorParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToCsvParams.java 
b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToCsvParams.java new file mode 100644 index 000000000..7bcfdff1d --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToCsvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface VectorToCsvParams extends + ToCsvParams, + FromVectorParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToJsonParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToJsonParams.java new file mode 100644 index 000000000..bce4f1602 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToJsonParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface VectorToJsonParams extends + ToJsonParams, + FromVectorParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToKvParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToKvParams.java new file mode 100644 index 000000000..ac4301698 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToKvParams.java @@ -0,0 +1,8 @@ + +package com.alibaba.alink.params.dataproc.format; + +public interface VectorToKvParams extends + ToKvParams, + FromVectorParams, + HasHandleInvalidDefaultAsError { +} diff --git a/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToTripleParams.java b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToTripleParams.java new file mode 100644 index 000000000..099068d60 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/dataproc/format/VectorToTripleParams.java @@ -0,0 +1,5 @@ +package com.alibaba.alink.params.dataproc.format; + +public interface VectorToTripleParams extends + FromVectorParams, + HasHandleInvalidDefaultAsError {} \ No newline at end of 
file diff --git a/core/src/main/java/com/alibaba/alink/params/shared/colname/HasReservedColsDefaultAsNull.java b/core/src/main/java/com/alibaba/alink/params/shared/colname/HasReservedColsDefaultAsNull.java new file mode 100644 index 000000000..87876f410 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/params/shared/colname/HasReservedColsDefaultAsNull.java @@ -0,0 +1,26 @@ +package com.alibaba.alink.params.shared.colname; + +import org.apache.flink.ml.api.misc.param.ParamInfo; +import org.apache.flink.ml.api.misc.param.ParamInfoFactory; +import org.apache.flink.ml.api.misc.param.WithParams; + +/** + * An interface for classes with a parameter specifying the names of the columns to be retained in the output table. + */ +public interface HasReservedColsDefaultAsNull extends WithParams { + + ParamInfo RESERVED_COLS = ParamInfoFactory + .createParamInfo("reservedCols", String[].class) + .setDescription("Names of the columns to be retained in the output table") + .setAlias(new String[] {"keepColNames"}) + .setHasDefaultValue(null) + .build(); + + default String[] getReservedCols() { + return get(RESERVED_COLS); + } + + default T setReservedCols(String... 
colNames) { + return set(RESERVED_COLS, colNames); + } +} diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/BaseFormatTrans.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/BaseFormatTrans.java new file mode 100644 index 000000000..cf9d0dd49 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/BaseFormatTrans.java @@ -0,0 +1,21 @@ +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatTransMapper; +import com.alibaba.alink.operator.common.dataproc.format.FormatTransParams; +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.pipeline.MapTransformer; +import org.apache.flink.ml.api.misc.param.Params; + +public class BaseFormatTrans> extends MapTransformer { + public BaseFormatTrans(FormatType fromFormat, FormatType toFormat, Params params) { + this( + (null == params ? new Params() : params) + .set(FormatTransParams.FROM_FORMAT, fromFormat) + .set(FormatTransParams.TO_FORMAT, toFormat) + ); + } + + private BaseFormatTrans(Params params) { + super(FormatTransMapper::new, params); + } +} diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToCsv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToCsv.java new file mode 100644 index 000000000..5f18ac28c --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToCsv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToCsv extends BaseFormatTrans implements ColumnsToCsvParams { + + public ColumnsToCsv() { + this(new Params()); + } + + public ColumnsToCsv(Params params) { + super(FormatType.COLUMNS, FormatType.CSV, 
params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToJson.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToJson.java new file mode 100644 index 000000000..665384333 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToJson.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToJson extends BaseFormatTrans implements ColumnsToJsonParams { + + public ColumnsToJson() { + this(new Params()); + } + + public ColumnsToJson(Params params) { + super(FormatType.COLUMNS, FormatType.JSON, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToKv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToKv.java new file mode 100644 index 000000000..405c792ee --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToKv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToKv extends BaseFormatTrans implements ColumnsToKvParams { + + public ColumnsToKv() { + this(new Params()); + } + + public ColumnsToKv(Params params) { + super(FormatType.COLUMNS, FormatType.KV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToVector.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToVector.java new file mode 100644 index 000000000..5c1c887e5 --- /dev/null +++ 
b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/ColumnsToVector.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.ColumnsToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class ColumnsToVector extends BaseFormatTrans implements ColumnsToVectorParams { + + public ColumnsToVector() { + this(new Params()); + } + + public ColumnsToVector(Params params) { + super(FormatType.COLUMNS, FormatType.VECTOR, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToColumns.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToColumns.java new file mode 100644 index 000000000..f66d0d12a --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToColumns.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToColumns extends BaseFormatTrans implements CsvToColumnsParams { + + public CsvToColumns() { + this(new Params()); + } + + public CsvToColumns(Params params) { + super(FormatType.CSV, FormatType.COLUMNS, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToJson.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToJson.java new file mode 100644 index 000000000..50487d0cc --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToJson.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToJsonParams; +import 
org.apache.flink.ml.api.misc.param.Params; + +public class CsvToJson extends BaseFormatTrans implements CsvToJsonParams { + + public CsvToJson() { + this(new Params()); + } + + public CsvToJson(Params params) { + super(FormatType.CSV, FormatType.JSON, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToKv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToKv.java new file mode 100644 index 000000000..7b9f55811 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToKv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToKvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToKv extends BaseFormatTrans implements CsvToKvParams { + + public CsvToKv() { + this(new Params()); + } + + public CsvToKv(Params params) { + super(FormatType.CSV, FormatType.KV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToVector.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToVector.java new file mode 100644 index 000000000..14aeeb320 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/CsvToVector.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.CsvToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class CsvToVector extends BaseFormatTrans implements CsvToVectorParams { + + public CsvToVector() { + this(new Params()); + } + + public CsvToVector(Params params) { + super(FormatType.CSV, FormatType.VECTOR, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToColumns.java 
b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToColumns.java new file mode 100644 index 000000000..d03745953 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToColumns.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToColumns extends BaseFormatTrans implements JsonToColumnsParams { + + public JsonToColumns() { + this(new Params()); + } + + public JsonToColumns(Params params) { + super(FormatType.JSON, FormatType.COLUMNS, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToCsv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToCsv.java new file mode 100644 index 000000000..b7e35e2e2 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToCsv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; + +public class JsonToCsv extends BaseFormatTrans implements JsonToCsvParams { + + public JsonToCsv() { + this(new Params()); + } + + public JsonToCsv(Params params) { + super(FormatType.JSON, FormatType.CSV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToKv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToKv.java new file mode 100644 index 000000000..31fe72afa --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToKv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import 
com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToKvParams; +import org.apache.flink.ml.api.misc.param.Params; +/** JSON-to-KV format transformer: forwards {@code FormatType.JSON} and {@code FormatType.KV} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class JsonToKv extends BaseFormatTrans implements JsonToKvParams { + + public JsonToKv() { + this(new Params()); + } + + public JsonToKv(Params params) { + super(FormatType.JSON, FormatType.KV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToVector.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToVector.java new file mode 100644 index 000000000..f9c2af526 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/JsonToVector.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.JsonToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; +/** JSON-to-vector format transformer: forwards {@code FormatType.JSON} and {@code FormatType.VECTOR} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class JsonToVector extends BaseFormatTrans implements JsonToVectorParams { + + public JsonToVector() { + this(new Params()); + } + + public JsonToVector(Params params) { + super(FormatType.JSON, FormatType.VECTOR, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToColumns.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToColumns.java new file mode 100644 index 000000000..d18191b7e --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToColumns.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; +/** KV-to-columns format transformer: forwards {@code FormatType.KV} and {@code FormatType.COLUMNS} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class KvToColumns extends BaseFormatTrans implements KvToColumnsParams { + + public KvToColumns() { + this(new Params()); + } + + public KvToColumns(Params params) { +
super(FormatType.KV, FormatType.COLUMNS, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToCsv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToCsv.java new file mode 100644 index 000000000..fd83a6ea0 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToCsv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; +/** KV-to-CSV format transformer: forwards {@code FormatType.KV} and {@code FormatType.CSV} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class KvToCsv extends BaseFormatTrans implements KvToCsvParams { + + public KvToCsv() { + this(new Params()); + } + + public KvToCsv(Params params) { + super(FormatType.KV, FormatType.CSV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToJson.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToJson.java new file mode 100644 index 000000000..1a15b1670 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToJson.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; +/** KV-to-JSON format transformer: forwards {@code FormatType.KV} and {@code FormatType.JSON} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class KvToJson extends BaseFormatTrans implements KvToJsonParams { + + public KvToJson() { + this(new Params()); + } + + public KvToJson(Params params) { + super(FormatType.KV, FormatType.JSON, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToVector.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToVector.java new file mode 100644 index 000000000..7d59d2851 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/KvToVector.java @@ -0,0 +1,18 @@ + +package
com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.KvToVectorParams; +import org.apache.flink.ml.api.misc.param.Params; +/** KV-to-vector format transformer: forwards {@code FormatType.KV} and {@code FormatType.VECTOR} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class KvToVector extends BaseFormatTrans implements KvToVectorParams { + + public KvToVector() { + this(new Params()); + } + + public KvToVector(Params params) { + super(FormatType.KV, FormatType.VECTOR, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToColumns.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToColumns.java new file mode 100644 index 000000000..12529dcbe --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToColumns.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToColumnsParams; +import org.apache.flink.ml.api.misc.param.Params; +/** Vector-to-columns format transformer: forwards {@code FormatType.VECTOR} and {@code FormatType.COLUMNS} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class VectorToColumns extends BaseFormatTrans implements VectorToColumnsParams { + + public VectorToColumns() { + this(new Params()); + } + + public VectorToColumns(Params params) { + super(FormatType.VECTOR, FormatType.COLUMNS, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToCsv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToCsv.java new file mode 100644 index 000000000..a420e0a53 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToCsv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToCsvParams; +import org.apache.flink.ml.api.misc.param.Params; +/** Vector-to-CSV format transformer: forwards {@code FormatType.VECTOR} and {@code FormatType.CSV} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class VectorToCsv extends BaseFormatTrans implements VectorToCsvParams { + + public
VectorToCsv() { + this(new Params()); + } + + public VectorToCsv(Params params) { + super(FormatType.VECTOR, FormatType.CSV, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToJson.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToJson.java new file mode 100644 index 000000000..d6b2f3f05 --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToJson.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToJsonParams; +import org.apache.flink.ml.api.misc.param.Params; +/** Vector-to-JSON format transformer: forwards {@code FormatType.VECTOR} and {@code FormatType.JSON} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class VectorToJson extends BaseFormatTrans implements VectorToJsonParams { + + public VectorToJson() { + this(new Params()); + } + + public VectorToJson(Params params) { + super(FormatType.VECTOR, FormatType.JSON, params); + } +} + diff --git a/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToKv.java b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToKv.java new file mode 100644 index 000000000..6e00844ea --- /dev/null +++ b/core/src/main/java/com/alibaba/alink/pipeline/dataproc/format/VectorToKv.java @@ -0,0 +1,18 @@ + +package com.alibaba.alink.pipeline.dataproc.format; + +import com.alibaba.alink.operator.common.dataproc.format.FormatType; +import com.alibaba.alink.params.dataproc.format.VectorToKvParams; +import org.apache.flink.ml.api.misc.param.Params; +/** Vector-to-KV format transformer: forwards {@code FormatType.VECTOR} and {@code FormatType.KV} (in that order) with the params to {@code BaseFormatTrans}; the no-arg constructor starts from an empty {@code Params}. */ +public class VectorToKv extends BaseFormatTrans implements VectorToKvParams { + + public VectorToKv() { + this(new Params()); + } + + public VectorToKv(Params params) { + super(FormatType.VECTOR, FormatType.KV, params); + } +} + diff --git a/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapperTest.java b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapperTest.java
new file mode 100644 index 000000000..2a6da5ed8 --- /dev/null +++ b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/AnyToTripleFlatMapperTest.java @@ -0,0 +1,39 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import com.alibaba.alink.params.dataproc.format.FromKvParams; +import com.alibaba.alink.params.dataproc.format.ToTripleParams; +import org.apache.flink.ml.api.misc.param.Params; +import org.apache.flink.types.Row; + +import com.alibaba.alink.common.utils.RowCollector; +import com.alibaba.alink.operator.common.io.csv.CsvUtil; +import org.junit.Assert; +import org.junit.Test; + +/** Unit test for {@link AnyToTripleFlatMapper} configured with FormatType.KV as the source format. */ +public class AnyToTripleFlatMapperTest { + /** Feeds one row whose "kv" column holds two key:value entries and checks that two rows are collected. */ + @Test + public void flatMap() throws Exception { + + AnyToTripleFlatMapper transKvToTriple = new AnyToTripleFlatMapper( + CsvUtil.schemaStr2Schema("row_id long, kv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.KV) + .set(FromKvParams.KV_COL, "kv") + .set(FromKvParams.KV_COL_DELIMITER, ",") + .set(FromKvParams.KV_VAL_DELIMITER, ":") + .set(ToTripleParams.TRIPLE_COL_VAL_SCHEMA_STR, "col_id int, val double") + .set(ToTripleParams.RESERVED_COLS, new String[] {"row_id"}) + ); + transKvToTriple.open(); + + RowCollector collector = new RowCollector(); + transKvToTriple.flatMap(Row.of(3L, "1:1.0,4:1.0"), collector); + + /* JUnit's signature is assertEquals(expected, actual); keeping that order makes a failure message read correctly. */ + Assert.assertEquals(2, collector.getRows().size()); + + } + +} \ No newline at end of file diff --git a/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransMapperTest.java b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransMapperTest.java new file mode 100644 index 000000000..ddb5e2cdf --- /dev/null +++ b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/FormatTransMapperTest.java @@ -0,0 +1,216 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import com.alibaba.alink.params.dataproc.format.*; +import
org.apache.flink.ml.api.misc.param.Params; +import org.apache.flink.types.Row; + +import com.alibaba.alink.operator.common.io.csv.CsvUtil; +import org.junit.Assert; +import org.junit.Test; + +/** Unit tests that run {@link FormatTransMapper} on single rows for several FROM_FORMAT / TO_FORMAT pairs. */ +public class FormatTransMapperTest { + + /** Covers VECTOR to COLUMNS, COLUMNS to VECTOR, KV to VECTOR and VECTOR to KV. */ + @Test + public void testSparse() throws Exception { + /* The same three index:value entries (0:1.1, 2:3.3, 3:4.4) written as a vector string and as a KV string. */ + String vecStr = "$5$0:1.1 2:3.3 3:4.4"; + + String kvStr = "0:1.1,2:3.3,3:4.4"; + + /* Five plain double values, the input row of the COLUMNS to VECTOR case. */ + Row columns = Row.of(1.1, 2.2, 3.3, 4.4, 5.5); + + String schemaStr = "f1 double, f2 double, f3 double, f4 double, f5 double"; + + FormatTransMapper transVectorToColumns = new FormatTransMapper( + CsvUtil.schemaStr2Schema("vec string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.VECTOR) + .set(FormatTransParams.TO_FORMAT, FormatType.COLUMNS) + .set(VectorToColumnsParams.VECTOR_COL, "vec") + .set(VectorToColumnsParams.SCHEMA_STR, schemaStr) + ); + transVectorToColumns.open(); + + Row resultVectorToColumns = transVectorToColumns.map(Row.of(vecStr)); + + /* The field five positions from the end must print as "1.1". */ + Assert.assertEquals( + "1.1", + resultVectorToColumns.getField(resultVectorToColumns.getArity() - 5).toString() + ); + + FormatTransMapper transColumnsToVector = new FormatTransMapper( + CsvUtil.schemaStr2Schema(schemaStr), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.COLUMNS) + .set(FormatTransParams.TO_FORMAT, FormatType.VECTOR) + .set(ColumnsToVectorParams.SELECTED_COLS, new String[] {"f1", "f2", "f3", "f4", "f5"}) + .set(ColumnsToVectorParams.VECTOR_COL, "vec") + ); + transColumnsToVector.open(); + + Row resultColumnsToVector = transColumnsToVector.map(columns); + + /* The last field must print the five values separated by spaces. */ + Assert.assertEquals( + "1.1 2.2 3.3 4.4 5.5", + resultColumnsToVector.getField(resultColumnsToVector.getArity() - 1).toString()
+ ); + + FormatTransMapper transKvToVector = new FormatTransMapper( + CsvUtil.schemaStr2Schema("kv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.KV) + .set(FormatTransParams.TO_FORMAT, FormatType.VECTOR) + .set(KvToVectorParams.KV_COL, "kv") + .set(KvToVectorParams.KV_VAL_DELIMITER, ":") + .set(KvToVectorParams.KV_COL_DELIMITER, ",") + .set(KvToVectorParams.VECTOR_COL, "vec") + .set(KvToVectorParams.VECTOR_SIZE, 5L) + ); + transKvToVector.open(); + + Row resultKvToVector = transKvToVector.map(Row.of(kvStr)); + + /* NOTE(review): only the string length is compared with vecStr, not the content. */ + Assert.assertEquals( + vecStr.length(), + resultKvToVector.getField(resultKvToVector.getArity() - 1).toString().length() + ); + + FormatTransMapper transVectorToKv = new FormatTransMapper( + CsvUtil.schemaStr2Schema("vec string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.VECTOR) + .set(FormatTransParams.TO_FORMAT, FormatType.KV) + .set(VectorToKvParams.VECTOR_COL, "vec") + .set(VectorToKvParams.KV_COL, "kv") + .set(VectorToKvParams.KV_VAL_DELIMITER, ":") + .set(VectorToKvParams.KV_COL_DELIMITER, ",") + ); + transVectorToKv.open(); + + Row resultVectorToKv = transVectorToKv.map(Row.of(vecStr)); + + /* NOTE(review): only the string length is compared with kvStr, not the content. */ + Assert.assertEquals( + kvStr.length(), + resultVectorToKv.getField(resultVectorToKv.getArity() - 1).toString().length() + ); + + } + + /** Covers JSON to COLUMNS, KV to COLUMNS, KV to CSV, CSV to KV and KV to JSON on a seven-field record. */ + @Test + public void testDense() throws Exception { + /* One record in three encodings: CSV with "$" as field delimiter, KV with "," and "=", and JSON. */ + String csvStr = "1$2.0$false$val$2018-09-10$14:22:20$2018-09-10 14:22:20"; + String kvStr = "f1=1,f2=2.0,f3=false,f4=val,f5=2018-09-10,f6=14:22:20,f7=2018-09-10 14:22:20"; + String jsonStr = "{\"f6\":\"14:22:20\",\"f7\":\"2018-09-10 14:22:20\",\"f1\":\"1\",\"f2\":\"2.0\"," + + "\"f3\":\"false\",\"f4\":\"val\",\"f5\":\"2018-09-10\"}"; + + String schemaStr = "f1 bigint, f2 double, f3 boolean, f4 string, f5 date, f6 time, f7 timestamp"; + + FormatTransMapper transJsonToColumns = new FormatTransMapper( + CsvUtil.schemaStr2Schema("json string"), + new
Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.JSON) + .set(FormatTransParams.TO_FORMAT, FormatType.COLUMNS) + .set(JsonToColumnsParams.JSON_COL, "json") + .set(JsonToColumnsParams.SCHEMA_STR, schemaStr) + ); + transJsonToColumns.open(); + + Row resultJsonToColumns = transJsonToColumns.map(Row.of(jsonStr)); + + /* The field seven positions from the end must print as "1". */ + Assert.assertEquals( + "1", + resultJsonToColumns.getField(resultJsonToColumns.getArity() - 7).toString() + ); + + FormatTransMapper transKvToColumns = new FormatTransMapper( + CsvUtil.schemaStr2Schema("kv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.KV) + .set(FormatTransParams.TO_FORMAT, FormatType.COLUMNS) + .set(KvToColumnsParams.KV_COL, "kv") + .set(KvToColumnsParams.KV_COL_DELIMITER, ",") + .set(KvToColumnsParams.KV_VAL_DELIMITER, "=") + .set(KvToColumnsParams.SCHEMA_STR, schemaStr) + ); + transKvToColumns.open(); + + Row resultKvToColumns = transKvToColumns.map(Row.of(kvStr)); + + /* The field seven positions from the end must print as "1". */ + Assert.assertEquals( + "1", + resultKvToColumns.getField(resultKvToColumns.getArity() - 7).toString() + ); + + FormatTransMapper transKvToCsv = new FormatTransMapper( + CsvUtil.schemaStr2Schema("kv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.KV) + .set(FormatTransParams.TO_FORMAT, FormatType.CSV) + .set(KvToCsvParams.KV_COL, "kv") + .set(KvToCsvParams.KV_COL_DELIMITER, ",") + .set(KvToCsvParams.KV_VAL_DELIMITER, "=") + .set(KvToCsvParams.SCHEMA_STR, schemaStr) + .set(KvToCsvParams.CSV_COL, "csv") + .set(KvToCsvParams.CSV_FIELD_DELIMITER, "$") + ); + transKvToCsv.open(); + + Row resultKvToCsv = transKvToCsv.map(Row.of(kvStr)); + + /* The last field must equal csvStr exactly. */ + Assert.assertEquals(csvStr, resultKvToCsv.getField(resultKvToCsv.getArity() - 1)); + + FormatTransMapper transCsvToKv = new FormatTransMapper( + CsvUtil.schemaStr2Schema("csv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.CSV)
+ .set(FormatTransParams.TO_FORMAT, FormatType.KV) + .set(CsvToKvParams.SCHEMA_STR, schemaStr) + .set(CsvToKvParams.CSV_COL, "csv") + .set(CsvToKvParams.CSV_FIELD_DELIMITER, "$") + .set(CsvToKvParams.KV_COL, "kv") + .set(CsvToKvParams.KV_COL_DELIMITER, ",") + .set(CsvToKvParams.KV_VAL_DELIMITER, "=") + ); + transCsvToKv.open(); + + Row resultCsvToKv = transCsvToKv.map(Row.of(csvStr)); + + /* NOTE(review): length-only check; the result is expected to be two characters longer than kvStr. */ + Assert.assertEquals( + kvStr.length() + 2, + resultCsvToKv.getField(resultCsvToKv.getArity() - 1).toString().length() + ); + + FormatTransMapper transKvToJson = new FormatTransMapper( + CsvUtil.schemaStr2Schema("kv string"), + new Params() + .set(FormatTransParams.FROM_FORMAT, FormatType.KV) + .set(FormatTransParams.TO_FORMAT, FormatType.JSON) + .set(KvToJsonParams.KV_COL, "kv") + .set(KvToJsonParams.KV_COL_DELIMITER, ",") + .set(KvToJsonParams.KV_VAL_DELIMITER, "=") + .set(KvToJsonParams.JSON_COL, "json") + ); + transKvToJson.open(); + + Row resultKvToJson = transKvToJson.map(Row.of(kvStr)); + + /* NOTE(review): length-only check against jsonStr, not a content comparison. */ + Assert.assertEquals(jsonStr.length(), resultKvToJson.getField(resultKvToJson.getArity() - 1).toString() + .length()); + + } + +} \ No newline at end of file diff --git a/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/TripleToAnyBatchOpTest.java b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/TripleToAnyBatchOpTest.java new file mode 100644 index 000000000..8e758df9d --- /dev/null +++ b/core/src/test/java/com/alibaba/alink/operator/common/dataproc/format/TripleToAnyBatchOpTest.java @@ -0,0 +1,66 @@ +package com.alibaba.alink.operator.common.dataproc.format; + +import com.alibaba.alink.operator.batch.dataproc.format.TripleToAnyBatchOp; +import com.alibaba.alink.params.dataproc.format.FromTripleParams; +import org.apache.flink.ml.api.misc.param.Params; +import org.apache.flink.types.Row; + +import com.alibaba.alink.common.MLEnvironmentFactory; +import
com.alibaba.alink.operator.batch.BatchOperator; +import com.alibaba.alink.operator.batch.source.TableSourceBatchOp; +import com.alibaba.alink.params.dataproc.format.ToKvParams; +import com.alibaba.alink.params.dataproc.format.ToVectorParams; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Arrays; +/** Batch test: links {@link TripleToAnyBatchOp} to an eight-row (start, dest, weight) table and expects four collected rows for FormatType.KV and for FormatType.VECTOR. */ +public class TripleToAnyBatchOpTest { + @Test + public void linkFrom() throws Exception { + Row[] rowData = new Row[] { + Row.of(1, 1, 1.0), + Row.of(1, 2, 1.0), + Row.of(2, 3, 1.0), + Row.of(3, 4, 1.0), + Row.of(4, 2, 1.0), + Row.of(3, 1, 1.0), + Row.of(2, 4, 1.0), + Row.of(4, 1, 1.0) + }; + BatchOperator data = new TableSourceBatchOp(MLEnvironmentFactory.getDefault().createBatchTable( + Arrays.asList(rowData), + new String[] {"start", "dest", "weight"})); +/* KV target: "start" is the triple row column and takes four distinct values (1-4) in rowData; presumably one output row per value. */ + BatchOperator triple2anyRes = new TripleToAnyBatchOp( + FormatType.KV, + new Params() + .set(FromTripleParams.TRIPLE_ROW_COL, "start") + .set(FromTripleParams.TRIPLE_COL_COL, "dest") + .set(FromTripleParams.TRIPLE_VAL_COL, "weight") + .set(ToKvParams.KV_COL, "kv") + ).linkFrom(data); + Assert.assertEquals(4, triple2anyRes.collect().size()); + + // NOTE(review): a FormatType.COLUMNS case is disabled here and kept only as a sketch: + // TripleToVectorParams.ROW_COL = "start" + // TripleToVectorParams.COLUMN_COL = "dest" + // TripleToVectorParams.VAL_COL = "weight" + // ToColumnsParams.SCHEMA_STR = "f0 double, f1 double, f2 double, f3 double, f4 double" + // It ended in linkFrom(data).lazyPrint(-1) with no assertion, and its param keys differ from the + // FromTripleParams.TRIPLE_*_COL keys used by the enabled cases - TODO confirm the keys, then + // re-enable with an assertion or delete. + + BatchOperator triple2anyRes2 = new TripleToAnyBatchOp( + FormatType.VECTOR, + new Params() + .set(FromTripleParams.TRIPLE_ROW_COL, "start") + .set(FromTripleParams.TRIPLE_COL_COL, "dest") + .set(FromTripleParams.TRIPLE_VAL_COL, "weight") + .set(ToVectorParams.VECTOR_COL, "vec") + ).linkFrom(data); + Assert.assertEquals(4, triple2anyRes2.collect().size()); + + } + +} \ No newline at end of file