From dab09f465f5115574bc8be88e0da50cd477e87be Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Wed, 2 Dec 2020 15:20:45 +0100 Subject: [PATCH 01/17] Implemented URLCleaner processor. --- .../logisland-processor-web-analytics/pom.xml | 7 + .../processor/webAnalytics/URLCleaner.java | 226 ++++++++++++++++++ .../modele/AllQueryParameterRemover.java | 14 ++ .../modele/KeepSomeQueryParameterRemover.java | 32 +++ .../modele/QueryParameterRemover.java | 7 + .../RemoveSomeQueryParameterRemover.java | 32 +++ .../validator/StandardValidators.java | 33 +++ 7 files changed, 351 insertions(+) create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml b/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml index e33afd09a..2324a7bb1 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml @@ -69,6 +69,12 @@ true + + org.apache.httpcomponents + 
httpclient + 4.5.12 + + com.hurence.logisland @@ -77,6 +83,7 @@ test + diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java new file mode 100644 index 000000000..57181fbeb --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java @@ -0,0 +1,226 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.processor.webAnalytics; + +import com.hurence.logisland.annotation.behavior.DynamicProperty; +import com.hurence.logisland.annotation.documentation.CapabilityDescription; +import com.hurence.logisland.annotation.documentation.ExtraDetailFile; +import com.hurence.logisland.annotation.documentation.Tags; +import com.hurence.logisland.component.AllowableValue; +import com.hurence.logisland.component.InitializationException; +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.processor.AbstractProcessor; +import com.hurence.logisland.processor.ProcessContext; +import com.hurence.logisland.processor.ProcessError; +import com.hurence.logisland.processor.webAnalytics.modele.AllQueryParameterRemover; +import com.hurence.logisland.processor.webAnalytics.modele.KeepSomeQueryParameterRemover; +import com.hurence.logisland.processor.webAnalytics.modele.QueryParameterRemover; +import com.hurence.logisland.processor.webAnalytics.modele.RemoveSomeQueryParameterRemover; +import com.hurence.logisland.record.FieldType; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.validator.StandardValidators; +import com.hurence.logisland.validator.ValidationContext; +import com.hurence.logisland.validator.ValidationResult; + +import java.net.URISyntaxException; +import java.util.*; + +@Tags({"record", "fields", "Decode"}) +@CapabilityDescription("Decode one or more field containing an URL with possibly special chars encoded\n" + + "...") +@DynamicProperty(name = "fields to decode", + supportsExpressionLanguage = false, + value = "a default value", + description = "Decode one or more fields from the record ") +@ExtraDetailFile("./details/URLDecoder-Detail.rst") +public class URLCleaner extends AbstractProcessor { + + public static final AllowableValue OVERWRITE_EXISTING = + new AllowableValue("overwrite_existing", "overwrite existing field", "if field already exist"); + + public static final 
AllowableValue KEEP_OLD_FIELD = + new AllowableValue("keep_only_old_field", "keep only old field value", "keep only old field"); + + public static final PropertyDescriptor CONFLICT_RESOLUTION_POLICY = new PropertyDescriptor.Builder() + .name("conflict.resolution.policy") + .description("What to do when a field with the same name already exists ?") + .required(false) + .defaultValue(KEEP_OLD_FIELD.getValue()) + .allowableValues(OVERWRITE_EXISTING, KEEP_OLD_FIELD) + .build(); + + public static final PropertyDescriptor URL_FIELDS = new PropertyDescriptor.Builder() + .name("url.fields") + .description("List of fields (URL) to decode and optionnaly the output field for the url modified. Syntax should be " + + ",,...,. So fields name can not contain ',' nor ':'") + .required(true) + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_COLON_SUB_SEPARATOR_VALIDATOR) + .build(); + + public static final String PARAM_NAMES_INCLUDE_PROP_NAME = "param.names.include"; + public static final String PARAM_NAMES_EXCLUDE_PROP_NAME = "param.names.exclude"; + public static final String REMOVE_ALL_PARAMS_PROP_NAME = "remove.all.params"; + + public static final PropertyDescriptor KEEP_PARAMS = new PropertyDescriptor.Builder() + .name(PARAM_NAMES_INCLUDE_PROP_NAME) + .description("List of param names to keep in the input url (others will be removed). Can not be given at the same time as " + + PARAM_NAMES_EXCLUDE_PROP_NAME + " or " + REMOVE_ALL_PARAMS_PROP_NAME) + .required(false) + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .build(); + + public static final PropertyDescriptor REMOVE_PARAMS = new PropertyDescriptor.Builder() + .name(PARAM_NAMES_EXCLUDE_PROP_NAME) + .description("List of param names to remove from the input url (others will be kept). 
Can not be given at the same time as " + + PARAM_NAMES_INCLUDE_PROP_NAME + " or " + REMOVE_ALL_PARAMS_PROP_NAME) + .required(false) + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .build(); + + public static final PropertyDescriptor REMOVE_ALL_PARAMS = new PropertyDescriptor.Builder() + .name(REMOVE_ALL_PARAMS_PROP_NAME) + .description("Remove all params if true.") + .required(false) + .addValidator(StandardValidators.BOOLEAN_VALIDATOR) + .build(); + + private final Map fieldsToDecodeToOutputField = new HashMap<>(); + private String conflictPolicy; + private QueryParameterRemover remover; + + @Override + public List getSupportedPropertyDescriptors() { + final List descriptors = new ArrayList<>(); + descriptors.add(URL_FIELDS); + descriptors.add(CONFLICT_RESOLUTION_POLICY); + descriptors.add(KEEP_PARAMS); + descriptors.add(REMOVE_PARAMS); + descriptors.add(REMOVE_ALL_PARAMS); + return Collections.unmodifiableList(descriptors); + } + + @Override + protected Collection customValidate(ValidationContext context) { + final List validationResults = new ArrayList<>(super.customValidate(context)); + /** + * Only one of both properties may be set. 
+ */ + if (context.getPropertyValue(REMOVE_ALL_PARAMS).isSet()) { + if (context.getPropertyValue(REMOVE_ALL_PARAMS).asBoolean()) { + if (context.getPropertyValue(KEEP_PARAMS).isSet() || context.getPropertyValue(REMOVE_PARAMS).isSet()) + { + validationResults.add( + new ValidationResult.Builder() + .explanation(KEEP_PARAMS.getName() + " and " + REMOVE_PARAMS.getName() + + " properties are mutually exclusive and can not be set if " + REMOVE_ALL_PARAMS.getName() + " is set to true") + .valid(false) + .build()); + } + } + } + if (context.getPropertyValue(KEEP_PARAMS).isSet() && context.getPropertyValue(REMOVE_PARAMS).isSet()) + { + validationResults.add( + new ValidationResult.Builder() + .explanation(KEEP_PARAMS.getName() + " and " + REMOVE_PARAMS.getName() + + " properties are mutually exclusive so it can not be set both at the same time.") + .valid(false) + .build()); + } + return validationResults; + } + + public void init(ProcessContext context) throws InitializationException { + super.init(context); + initFieldsToDecodeToOutputFiles(context); + this.conflictPolicy = context.getPropertyValue(CONFLICT_RESOLUTION_POLICY).asString(); + initRemover(context); + } + + public void initFieldsToDecodeToOutputFiles(ProcessContext context) { + fieldsToDecodeToOutputField.clear(); + String commaSeparatedFields = context.getPropertyValue(URL_FIELDS).asString(); + String[] fieldsArr = commaSeparatedFields.split("\\s*,\\s*"); + for (String field : fieldsArr) { + if (field.contains(":")) { + String[] fieldPair = field.split("\\s*:\\s*"); + fieldsToDecodeToOutputField.put(fieldPair[0], fieldPair[1]); + } else { + fieldsToDecodeToOutputField.put(field, field); + } + } + } + + public void initRemover(ProcessContext context) throws InitializationException { + if (context.getPropertyValue(KEEP_PARAMS).isSet()) { + String commaSeparatedKeepParams = context.getPropertyValue(KEEP_PARAMS).asString(); + String[] keepParamsArr = commaSeparatedKeepParams.split("\\s*,\\s*"); + final Set 
keepParams = new HashSet<>(Arrays.asList(keepParamsArr)); + this.remover = new KeepSomeQueryParameterRemover(keepParams); + return; + } + if (context.getPropertyValue(REMOVE_PARAMS).isSet()) { + String commaSeparatedRemoveParam = context.getPropertyValue(REMOVE_PARAMS).asString(); + String[] removeParamsArr = commaSeparatedRemoveParam.split("\\s*,\\s*"); + final Set removeParams = new HashSet<>(Arrays.asList(removeParamsArr)); + this.remover = new RemoveSomeQueryParameterRemover(removeParams); + return; + } + if (context.getPropertyValue(REMOVE_ALL_PARAMS).asBoolean()) { + this.remover = new AllQueryParameterRemover(); + } else { + throw new InitializationException("No remover was built, should never happen !" + + "Problem with configuration checking in processor."); + } + } + + @Override + public Collection process(ProcessContext context, Collection records) { + for (Record record : records) { + updateRecord(record); + } + return records; + } + + + private void updateRecord(Record record) { + fieldsToDecodeToOutputField.entrySet().forEach(kv -> { + tryUpdatingRecord(record, kv); + }); + } + + private void tryUpdatingRecord(Record record, Map.Entry kv) { + String inputFieldName = kv.getKey(); + String outputFieldName = kv.getValue(); + if (record.hasField(inputFieldName)) { + String value = record.getField(inputFieldName).asString();//TODO test if null in field + if (value != null) { + String cleanedUrl = null; + try { + cleanedUrl = remover.removeQueryParameters(value); + } catch (URISyntaxException e) { + getLogger().error("Error for url {}, for record {}.", new Object[]{value, record}, e); + String msg = "Could not parse url : '" + value + "' into URI, for record: '" + record.toString() + "'.\n Cause: " + e.getMessage(); + record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); + return; + } + if (!record.hasField(outputFieldName) || conflictPolicy.equals(OVERWRITE_EXISTING.getValue())) { + record.setField(outputFieldName, 
FieldType.STRING, cleanedUrl); + } + } + } + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java new file mode 100644 index 000000000..26cdb852b --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java @@ -0,0 +1,14 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import org.apache.http.client.utils.URIBuilder; + +import java.net.URISyntaxException; + +public class AllQueryParameterRemover implements QueryParameterRemover { + + public String removeQueryParameters(String url) throws URISyntaxException { + URIBuilder uriBuilder = new URIBuilder(url); + uriBuilder.removeQuery(); + return uriBuilder.build().toString(); + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java new file mode 100644 index 000000000..2e43e92e4 --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java @@ -0,0 +1,32 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URIBuilder; + +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; +import 
java.util.stream.Collectors; + +public class KeepSomeQueryParameterRemover implements QueryParameterRemover { + + final Set parameterToKeep; + + public KeepSomeQueryParameterRemover(Set parameterToKeep) { + this.parameterToKeep = parameterToKeep; + } + + public String removeQueryParameters(String url) throws URISyntaxException { + URIBuilder uriBuilder = new URIBuilder(url); + List queryParameters = uriBuilder.getQueryParams() + .stream() + .filter(p -> parameterToKeep.contains(p.getName())) + .collect(Collectors.toList()); + if (queryParameters.isEmpty()) { + uriBuilder.removeQuery(); + } else { + uriBuilder.setParameters(queryParameters); + } + return uriBuilder.build().toString(); + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java new file mode 100644 index 000000000..ceb5e1b92 --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java @@ -0,0 +1,7 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import java.net.URISyntaxException; + +public interface QueryParameterRemover { + String removeQueryParameters(String url) throws URISyntaxException; +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java new file mode 100644 index 000000000..0190be4eb --- /dev/null +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java @@ -0,0 +1,32 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import org.apache.http.NameValuePair; +import org.apache.http.client.utils.URIBuilder; + +import java.net.URISyntaxException; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class RemoveSomeQueryParameterRemover implements QueryParameterRemover { + + final Set parameterToRemove; + + public RemoveSomeQueryParameterRemover(Set parameterToRemove) { + this.parameterToRemove = parameterToRemove; + } + + public String removeQueryParameters(String url) throws URISyntaxException { + URIBuilder uriBuilder = new URIBuilder(url); + List queryParameters = uriBuilder.getQueryParams() + .stream() + .filter(p -> !parameterToRemove.contains(p.getName())) + .collect(Collectors.toList()); + if (queryParameters.isEmpty()) { + uriBuilder.removeQuery(); + } else { + uriBuilder.setParameters(queryParameters); + } + return uriBuilder.build().toString(); + } +} diff --git a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java index 7404b72e9..854f9dfc9 100644 --- a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java +++ b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java @@ -262,6 +262,39 @@ public ValidationResult validate(final String subject, final String value) { } }; + /** + * input should be separated by "," and optionnaly separated again with ":". 
Sub group should not be more than 2 + * + * Valid example : + * -a,b,v,d + * -a:r,b,v:m,d + * -a:i,b:zzz,v:pp,d:po + * Not Valid example : + * -a:a:a,b,v,d + * -a:b,b:p,v:g:g,d:a + */ + public static final Validator COMMA_SEPARATED_LIST_COLON_SUB_SEPARATOR_VALIDATOR = new Validator() { + @Override + public ValidationResult validate(final String subject, final String value) { + String reason = null; + try { + String[] fieldsArr = value.split("\\s*,\\s*"); + for (String field : fieldsArr) { + if (field.contains(":")) { + String[] fieldPair = field.split("\\s*:\\s*"); + if (fieldPair.length > 2) { + reason = "An element of the list contain several ':'"; + } + } + } + } catch (final Exception e) { + reason = "not a comma separated list"; + } + return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build(); + } + }; + + public static final Validator COMMA_SEPARATED_LIST_VALIDATOR = new Validator() { @Override public ValidationResult validate(final String subject, final String value) { From b24fe198547d47029356a62e4a36fa2e4b5a9915 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Wed, 2 Dec 2020 15:21:14 +0100 Subject: [PATCH 02/17] Removed unused code in URLDecoder processor and fixed some log issues. 
--- .../processor/webAnalytics/URLDecoder.java | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java index b153ecece..f8ed6ead9 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java @@ -35,14 +35,9 @@ @Tags({"record", "fields", "Decode"}) @CapabilityDescription("Decode one or more field containing an URL with possibly special chars encoded\n" + "...") -@DynamicProperty(name = "fields to decode", - supportsExpressionLanguage = false, - value = "a default value", - description = "Decode one or more fields from the record ") @ExtraDetailFile("./details/URLDecoder-Detail.rst") public class URLDecoder extends AbstractProcessor { - private static final Logger logger = LoggerFactory.getLogger(URLDecoder.class); private static final String UTF8_CHARSET = "UTF-8"; private final HashSet fieldsToDecode = new HashSet(); private final static String UTF8_PERCENT_ENCODED_CHAR = "%25"; @@ -71,18 +66,6 @@ public List getSupportedPropertyDescriptors() { return Collections.unmodifiableList(descriptors); } - - @Override - protected PropertyDescriptor getSupportedDynamicPropertyDescriptor(final String propertyDescriptorName) { - return new PropertyDescriptor.Builder() - .name(propertyDescriptorName) - .expressionLanguageSupported(true) - .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) - .required(false) - .dynamic(true) - .build(); - } - public void init(ProcessContext 
context) throws InitializationException { super.init(context); String commaSeparatedFields = context.getPropertyValue(FIELDS_TO_DECODE_PROP).asString(); @@ -97,7 +80,7 @@ public void init(ProcessContext context) throws InitializationException { percentEncodedChar = java.net.URLEncoder.encode("%", charset); } catch (UnsupportedEncodingException e1) { percentEncodedChar=UTF8_PERCENT_ENCODED_CHAR; // Default to UTF-8 encoded char - logger.warn(e1.toString()); + getLogger().warn("Error while initializing percentEncodedChar", e1); } } @@ -111,7 +94,6 @@ public Collection process(ProcessContext context, Collection rec private void updateRecord(ProcessContext context, Record record, HashSet fields) { - String charset = context.getPropertyValue(CHARSET_PROP).asString(); if ((fields == null) || fields.isEmpty()) { return; @@ -141,10 +123,10 @@ private void decode (String value, String charset, Record record, String fieldNa decode(value, charset, record, fieldName, false); } else { - logger.warn(e.toString()); + getLogger().warn(e.toString()); } } catch (Exception e){ - logger.warn(e.toString()); + getLogger().warn(e.toString()); } } From a64454d3fec083a9c625b1a7aee0166b99ac7546 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Wed, 2 Dec 2020 18:47:17 +0100 Subject: [PATCH 03/17] [WIP] implemented unit test. Fixed some bugs. 
--- .../processor/webAnalytics/URLCleaner.java | 37 +- .../modele/AbstractQueryParameterRemover.java | 39 ++ .../modele/AllQueryParameterRemover.java | 6 +- .../modele/KeepSomeQueryParameterRemover.java | 53 +- .../modele/QueryParameterRemover.java | 3 +- .../RemoveSomeQueryParameterRemover.java | 7 +- .../webAnalytics/URLCleanerTest.java | 520 ++++++++++++++++++ 7 files changed, 642 insertions(+), 23 deletions(-) create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java index 57181fbeb..9f17c81e0 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java @@ -35,6 +35,7 @@ import com.hurence.logisland.validator.ValidationContext; import com.hurence.logisland.validator.ValidationResult; +import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.util.*; @@ -63,31 +64,34 @@ public class URLCleaner extends AbstractProcessor { .build(); public static final PropertyDescriptor URL_FIELDS = new PropertyDescriptor.Builder() - .name("url.fields") - .description("List of fields (URL) to decode and optionnaly the output field for the url modified. 
Syntax should be " + - ",,...,. So fields name can not contain ',' nor ':'") - .required(true) - .addValidator(StandardValidators.COMMA_SEPARATED_LIST_COLON_SUB_SEPARATOR_VALIDATOR) - .build(); + .name("url.fields") + .description("List of fields (URL) to decode and optionnaly the output field for the url modified. Syntax should be " + + ",,...,. So fields name can not contain ',' nor ':'") + .required(true) + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_COLON_SUB_SEPARATOR_VALIDATOR) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build(); public static final String PARAM_NAMES_INCLUDE_PROP_NAME = "param.names.include"; - public static final String PARAM_NAMES_EXCLUDE_PROP_NAME = "param.names.exclude"; + public static final String REMOVE_PARAMS_PROP_NAME = "param.names.exclude"; public static final String REMOVE_ALL_PARAMS_PROP_NAME = "remove.all.params"; public static final PropertyDescriptor KEEP_PARAMS = new PropertyDescriptor.Builder() .name(PARAM_NAMES_INCLUDE_PROP_NAME) .description("List of param names to keep in the input url (others will be removed). Can not be given at the same time as " + - PARAM_NAMES_EXCLUDE_PROP_NAME + " or " + REMOVE_ALL_PARAMS_PROP_NAME) + REMOVE_PARAMS_PROP_NAME + " or " + REMOVE_ALL_PARAMS_PROP_NAME) .required(false) .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .build(); public static final PropertyDescriptor REMOVE_PARAMS = new PropertyDescriptor.Builder() - .name(PARAM_NAMES_EXCLUDE_PROP_NAME) + .name(REMOVE_PARAMS_PROP_NAME) .description("List of param names to remove from the input url (others will be kept). 
Can not be given at the same time as " + PARAM_NAMES_INCLUDE_PROP_NAME + " or " + REMOVE_ALL_PARAMS_PROP_NAME) .required(false) .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .build(); public static final PropertyDescriptor REMOVE_ALL_PARAMS = new PropertyDescriptor.Builder() @@ -129,6 +133,15 @@ protected Collection customValidate(ValidationContext context) .valid(false) .build()); } + } else { + if (!context.getPropertyValue(KEEP_PARAMS).isSet() && !context.getPropertyValue(REMOVE_PARAMS).isSet()) { + validationResults.add( + new ValidationResult.Builder() + .explanation(KEEP_PARAMS.getName() + " or " + REMOVE_PARAMS.getName() + + " properties is required when " + REMOVE_ALL_PARAMS.getName() + " is set to false") + .valid(false) + .build()); + } } } if (context.getPropertyValue(KEEP_PARAMS).isSet() && context.getPropertyValue(REMOVE_PARAMS).isSet()) @@ -179,7 +192,7 @@ public void initRemover(ProcessContext context) throws InitializationException { this.remover = new RemoveSomeQueryParameterRemover(removeParams); return; } - if (context.getPropertyValue(REMOVE_ALL_PARAMS).asBoolean()) { + if (!context.getPropertyValue(REMOVE_ALL_PARAMS).isSet() || context.getPropertyValue(REMOVE_ALL_PARAMS).asBoolean()) { this.remover = new AllQueryParameterRemover(); } else { throw new InitializationException("No remover was built, should never happen !" 
+ @@ -211,9 +224,9 @@ private void tryUpdatingRecord(Record record, Map.Entry kv) { String cleanedUrl = null; try { cleanedUrl = remover.removeQueryParameters(value); - } catch (URISyntaxException e) { + } catch (URISyntaxException | UnsupportedEncodingException e) { getLogger().error("Error for url {}, for record {}.", new Object[]{value, record}, e); - String msg = "Could not parse url : '" + value + "' into URI, for record: '" + record.toString() + "'.\n Cause: " + e.getMessage(); + String msg = "Could not parse url : '" + value + "' into URI.\n Cause: " + e.getMessage(); record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); return; } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java new file mode 100644 index 000000000..9dd22bdcb --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java @@ -0,0 +1,39 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import org.apache.http.client.utils.URIBuilder; + +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; + +public abstract class AbstractQueryParameterRemover implements QueryParameterRemover { + + protected abstract String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException; + + public String removeQueryParameters(String url) throws UnsupportedEncodingException, URISyntaxException { + try { + URIBuilder uriBuilder = new URIBuilder(url); + return removeQueryParameters(uriBuilder); + } catch (URISyntaxException e) { + return 
tryHandlingCaseNotAValidURI(url); + } + } + + protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + return "toto"; +// try { +// String queryPart = urlStr.substring(urlStr.indexOf("?"), Integer.MAX_VALUE); +// EncodedQueryPart = +// URI uri = new URI(urlStr); +// return "toto"; +// URIBuilder uriBuilder = new URIBuilder(java.net.URLEncoder.encode(urlStr, "UTF-8")); +// String queryString = uri.getQuery(); +// URIBuilder uriBuilder = new URIBuilder(url.toURI()); +// String newUriEncoded = removeQueryParameters(uriBuilder); +// return java.net.URLDecoder.decode(newUriEncoded, "UTF-8"); +// } catch (MalformedURLException e) { +// throw new RuntimeException("dans ton cul", e); +// } + + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java index 26cdb852b..fd681937e 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java @@ -4,10 +4,10 @@ import java.net.URISyntaxException; -public class AllQueryParameterRemover implements QueryParameterRemover { +public class AllQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { - public String removeQueryParameters(String url) throws URISyntaxException { - URIBuilder uriBuilder = new URIBuilder(url); + @Override + protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { 
uriBuilder.removeQuery(); return uriBuilder.build().toString(); } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java index 2e43e92e4..1406f2ca8 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java @@ -3,12 +3,14 @@ import org.apache.http.NameValuePair; import org.apache.http.client.utils.URIBuilder; +import java.io.UnsupportedEncodingException; +import java.net.URI; import java.net.URISyntaxException; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -public class KeepSomeQueryParameterRemover implements QueryParameterRemover { +public class KeepSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { final Set parameterToKeep; @@ -16,8 +18,8 @@ public KeepSomeQueryParameterRemover(Set parameterToKeep) { this.parameterToKeep = parameterToKeep; } - public String removeQueryParameters(String url) throws URISyntaxException { - URIBuilder uriBuilder = new URIBuilder(url); + @Override + protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { List queryParameters = uriBuilder.getQueryParams() .stream() .filter(p -> parameterToKeep.contains(p.getName())) @@ -27,6 +29,49 @@ public String removeQueryParameters(String url) throws URISyntaxException { } else { uriBuilder.setParameters(queryParameters); } - return uriBuilder.build().toString(); + 
return uriBuilder.build().toString(); } + +// private void toString(URI uri) { +// StringBuffer sb = new StringBuffer(); +// if (uri.getScheme() != null) { +// sb.append(uri.getScheme()); +// sb.append(':'); +// } +// if (isOpaque()) { +// sb.append(schemeSpecificPart); +// } else { +// if (host != null) { +// sb.append("//"); +// if (userInfo != null) { +// sb.append(userInfo); +// sb.append('@'); +// } +// boolean needBrackets = ((host.indexOf(':') >= 0) +// && !host.startsWith("[") +// && !host.endsWith("]")); +// if (needBrackets) sb.append('['); +// sb.append(host); +// if (needBrackets) sb.append(']'); +// if (port != -1) { +// sb.append(':'); +// sb.append(port); +// } +// } else if (authority != null) { +// sb.append("//"); +// sb.append(authority); +// } +// if (path != null) +// sb.append(path); +// if (query != null) { +// sb.append('?'); +// sb.append(query); +// } +// } +// if (fragment != null) { +// sb.append('#'); +// sb.append(fragment); +// } +// string = sb.toString(); +// } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java index ceb5e1b92..99e772ac2 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java @@ -1,7 +1,8 @@ package com.hurence.logisland.processor.webAnalytics.modele; +import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; public interface QueryParameterRemover { - String removeQueryParameters(String url) throws 
URISyntaxException; + String removeQueryParameters(String url) throws URISyntaxException, UnsupportedEncodingException; } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java index 0190be4eb..4491a175d 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java @@ -3,12 +3,13 @@ import org.apache.http.NameValuePair; import org.apache.http.client.utils.URIBuilder; +import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -public class RemoveSomeQueryParameterRemover implements QueryParameterRemover { +public class RemoveSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { final Set parameterToRemove; @@ -16,8 +17,8 @@ public RemoveSomeQueryParameterRemover(Set parameterToRemove) { this.parameterToRemove = parameterToRemove; } - public String removeQueryParameters(String url) throws URISyntaxException { - URIBuilder uriBuilder = new URIBuilder(url); + @Override + protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { List queryParameters = uriBuilder.getQueryParams() .stream() .filter(p -> !parameterToRemove.contains(p.getName())) diff --git 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java new file mode 100644 index 000000000..a1e80edc8 --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java @@ -0,0 +1,520 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.processor.webAnalytics; + +import com.hurence.logisland.record.FieldType; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.record.StandardRecord; +import com.hurence.logisland.util.runner.MockRecord; +import com.hurence.logisland.util.runner.TestRunner; +import com.hurence.logisland.util.runner.TestRunners; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class URLCleanerTest { + + private static final Logger logger = LoggerFactory.getLogger(URLCleanerTest.class); + + private static final String url1 = "https://www.test.com/de/search/?text=toto"; + private static final String expectedUrl1WithoutParams = "https://www.test.com/de/search/"; + private static final String expectedUrl1KeepText = url1; + private static final String expectedUrl1RemoveText = expectedUrl1WithoutParams; + + private static final String url2 = "https://www.t%888est%20.com/de/search/?text=calendrier%20%20%202019"; + private static final String expectedUrl2WithoutParams = "https://www.t%888est%20.com/de/search/"; +// private static final String expectedUrl2KeepText = "https://www.t%888est%20.com/de/search/?text=calendrier+++2019"; + private static final String expectedUrl2KeepText = url2; + private static final String expectedUrl2RemoveText = expectedUrl2WithoutParams; + + private static final String url3 = "https://www.test.com/en/search/?text=key1+%20key2%20+%28key3-key4%29"; + private static final String expectedUrl3WithoutParams = "https://www.test.com/en/search/"; +// private static final String expectedUrl3KeepText = "https://www.test.com/en/search/?text=key1++key2++%28key3-key4%29"; + private static final String expectedUrl3KeepText = url3; + private static final String expectedUrl3RemoveText = expectedUrl3WithoutParams; + + private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; + private static final String 
expectedUrl4DecodedWithoutParams = "https://www.orexad.com/fr/search"; + private static final String expectedUrl4DecodedKeepQ = "https://www.orexad.com/fr/search?q=sauterelle||relevance"; + private static final String expectedUrl4DecodedKeepPage = "https://www.orexad.com/fr/search?page=2"; + private static final String expectedUrl4DecodedRemoveQ = "https://www.orexad.com/fr/search?page=2&sort=relevance"; + private static final String expectedUrl4DecodedRemovePage = "https://www.orexad.com/fr/search?q=sauterelle||relevance&sort=relevance"; + + private static final String url4 = "https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&page=2&sort=relevance"; + private static final String expectedUrl4WithoutParams = "https://www.orexad.com/fr/search"; + private static final String expectedUrl4KeepQ = "https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance"; + private static final String expectedUrl4KeepPage = "https://www.orexad.com/fr/search?page=2"; + private static final String expectedUrl4RemoveQ = "https://www.orexad.com/fr/search?page=2&sort=relevance"; + private static final String expectedUrl4RemovePage = "https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&sort=relevance"; + + private static final String url5 = "https://www.orexad.com/fr/sauterelle-inox-a-crochet-et-attache-a-levier-vertical/r-PR_G1050050542"; + private static final String expectedUrl5WithoutParams = "https://www.orexad.com/fr/sauterelle-inox-a-crochet-et-attache-a-levier-vertical/r-PR_G1050050542"; + + private static final String val1 = "key1+%20key2%20+%28key3-key4%29"; + private static final String expectedVal1WithoutParams = "key1+%20key2%20+%28key3-key4%29"; + + private static final String val2 = "%co"; + private static final String expectedVal5 = "%co"; + private static final String expectedVal2WithoutParams = "%co"; + + private static final String val3 = "%%"; + private static final String expectedVal6 = "%%"; + private static final String expectedVal3WithoutParams 
= "%%"; + + + + + private Record getRecord1() { + Record record1 = new StandardRecord(); + record1.setField("string1", FieldType.STRING, "value1"); + record1.setField("string2", FieldType.STRING, "value2"); + record1.setField("long1", FieldType.LONG, 1); + record1.setField("long2", FieldType.LONG, 2); + record1.setField("url1", FieldType.STRING, url1); + record1.setField("url2", FieldType.STRING, url2); + record1.setField("url3", FieldType.STRING, url3); + record1.setField("url4", FieldType.STRING, url4); +// record1.setField("url4Decoded", FieldType.STRING, url4Decoded); + record1.setField("url5", FieldType.STRING, url5); + record1.setField("val1", FieldType.STRING, val1); + record1.setField("val2", FieldType.STRING, val2); + record1.setField("val3", FieldType.STRING, val3); + return record1; + } + + @Test + public void testValidity() { + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.assertNotValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "string1"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.REMOVE_ALL_PARAMS, "false"); + testRunner.assertNotValid(); + testRunner.setProperty(URLCleaner.REMOVE_ALL_PARAMS, "true"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING.getValue()); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "a,b"); + testRunner.assertNotValid(); + testRunner.removeProperty(URLCleaner.REMOVE_PARAMS); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "a,b"); + testRunner.assertNotValid(); + testRunner.removeProperty(URLCleaner.KEEP_PARAMS); + testRunner.assertValid(); + testRunner.removeProperty(URLCleaner.REMOVE_ALL_PARAMS); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "a,b"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "a,b"); + testRunner.assertNotValid(); + 
testRunner.removeProperty(URLCleaner.KEEP_PARAMS); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, ""); + testRunner.assertNotValid(); + testRunner.removeProperty(URLCleaner.REMOVE_PARAMS); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, ""); + testRunner.assertNotValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "a:g:v"); + testRunner.assertNotValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "a:b,g:b,t"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "b,g:b,t"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "a,g,t"); + testRunner.assertValid(); + testRunner.setProperty(URLCleaner.URL_FIELDS, "a,b,g:b:h,t"); + testRunner.assertNotValid(); + } + + @Test + public void testNoURLValueField() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "string1:newstring1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize + 1); + out.assertFieldEquals("string1", "value1"); + out.assertFieldTypeEquals("string1", FieldType.STRING); + out.assertFieldEquals("newstring1", "value1"); + out.assertFieldTypeEquals("newstring1", FieldType.STRING); + } + + @Test + public void testNoURLValueField2() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "string1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = 
testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize); + out.assertFieldEquals("string1", "value1"); + out.assertFieldTypeEquals("string1", FieldType.STRING); + } + + @Test + public void testBasicURLValueField() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "string1:newstring1,url1: new_url1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize + 2); + out.assertFieldEquals("string1", "value1"); + out.assertFieldTypeEquals("string1", FieldType.STRING); + out.assertFieldEquals("newstring1", "value1"); + out.assertFieldTypeEquals("newstring1", FieldType.STRING); + out.assertFieldEquals("url1", url1); + out.assertFieldTypeEquals("url1", FieldType.STRING); + out.assertFieldEquals("new_url1", expectedUrl1WithoutParams); + out.assertFieldTypeEquals("new_url1", FieldType.STRING); + } + + @Test + public void testBasicURLValueField2() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "string1,url1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize); + out.assertFieldEquals("string1", "value1"); + out.assertFieldTypeEquals("string1", FieldType.STRING); + out.assertFieldEquals("url1", url1); + out.assertFieldTypeEquals("url1", FieldType.STRING); + } + + @Test + public void testEncodedURLValueField() { + Record record1 = getRecord1(); + int inputSize 
= record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url2,url1"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize); + out.assertFieldEquals("url2", expectedUrl2WithoutParams); + out.assertFieldTypeEquals("url2", FieldType.STRING); + out.assertFieldEquals("url1", expectedUrl1WithoutParams); + out.assertFieldTypeEquals("url1", FieldType.STRING); + } + + + @Test + public void testComplexField() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url3,url4,url5,val1"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(inputSize); + out.assertFieldEquals("url3", expectedUrl3WithoutParams); + out.assertFieldEquals("url4", expectedUrl4WithoutParams); + out.assertFieldEquals("url5", expectedUrl5WithoutParams); + out.assertFieldEquals("val1", expectedVal1WithoutParams); + } + + @Test + public void testWithAlreadyDecodedUrls() { + Record record1 = getRecord1(); + int inputSize = record1.size(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url4,url4Decoded,val2,val3"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + 
testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord errorRecord = testRunner.getOutputRecords().get(0); + errorRecord.assertRecordSizeEquals(inputSize); + errorRecord.assertFieldEquals("url4", expectedUrl4WithoutParams); + errorRecord.assertFieldEquals("url4Decoded", url4Decoded); + errorRecord.assertFieldEquals("val2", val2); + errorRecord.assertFieldEquals("val3", val3); + } + + @Test + public void testNoMatchingField() { + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "nonExistingField"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertContentEquals(record1); + } + + + @Test + public void testPercentButNotHexaField() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "val2,val3"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("val2", val2); + out.assertFieldEquals("val3", val3); + } + + @Test + public void testKeepText() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + 
testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "text"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1KeepText); + out.assertFieldEquals("url2", expectedUrl2KeepText); + out.assertFieldEquals("url3", expectedUrl3KeepText); + out.assertFieldEquals("url4", expectedUrl4WithoutParams); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); + } + + @Test + public void testKeepQ() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "q"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1WithoutParams); + out.assertFieldEquals("url2", expectedUrl2WithoutParams); + out.assertFieldEquals("url3", expectedUrl3WithoutParams); + out.assertFieldEquals("url4", expectedUrl4KeepQ); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepQ); + } + + @Test + public void testKeepPage() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + 
testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "page"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1WithoutParams); + out.assertFieldEquals("url2", expectedUrl2WithoutParams); + out.assertFieldEquals("url3", expectedUrl3WithoutParams); + out.assertFieldEquals("url4", expectedUrl4KeepPage); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepPage); + } + + @Test + public void testKeepTextAndQ() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.KEEP_PARAMS, "text, q"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1KeepText); + out.assertFieldEquals("url2", expectedUrl2KeepText); + out.assertFieldEquals("url3", expectedUrl3KeepText); + out.assertFieldEquals("url4", expectedUrl4KeepQ); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepPage); + } + + + @Test + public void testRemoveText() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, 
URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "text"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1RemoveText); + out.assertFieldEquals("url2", expectedUrl2RemoveText); + out.assertFieldEquals("url3", expectedUrl3RemoveText); + out.assertFieldEquals("url4", url4); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); + } + + @Test + public void testRemoveQ() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "q"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1KeepText); + out.assertFieldEquals("url2", expectedUrl2KeepText); + out.assertFieldEquals("url3", expectedUrl3KeepText); + out.assertFieldEquals("url4", expectedUrl4RemoveQ); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemoveQ); + } + + @Test + public void testRemovePage() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "page"); + 
testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1KeepText); + out.assertFieldEquals("url2", expectedUrl2KeepText); + out.assertFieldEquals("url3", expectedUrl3KeepText); + out.assertFieldEquals("url4", expectedUrl4RemovePage); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemovePage); + } + + @Test + public void testRemoveTextAndQ() { + Record record = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url1,url2,url3,url4,url4Decoded"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "text, q"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url1", expectedUrl1RemoveText); + out.assertFieldEquals("url2", expectedUrl2RemoveText); + out.assertFieldEquals("url3", expectedUrl3RemoveText); + out.assertFieldEquals("url4", expectedUrl4RemovePage); +// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemovePage); + } + +} From 75bb2ef4c4665bee0eb4c0b7835eaffcf6debb8a Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 11:44:57 +0100 Subject: [PATCH 04/17] Changed the way to remove params. Does not use URIBuilder anymore as it decode the query params values. If we want to use URIBuilder we should use it at the same time we decode the URI. 
--- .../modele/AbstractQueryParameterRemover.java | 37 ++++---- .../modele/AllQueryParameterRemover.java | 8 ++ .../modele/KeepSomeQueryParameterRemover.java | 75 +++++++--------- .../RemoveSomeQueryParameterRemover.java | 32 ++++++- .../webAnalytics/modele/SplittedURI.java | 67 +++++++++++++++ .../webAnalytics/URLCleanerTest.java | 86 ++++++++++++++++--- 6 files changed, 225 insertions(+), 80 deletions(-) create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java index 9dd22bdcb..4ccd02cb6 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java @@ -5,35 +5,28 @@ import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; +import java.util.Arrays; public abstract class AbstractQueryParameterRemover implements QueryParameterRemover { protected abstract String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException; public String removeQueryParameters(String url) throws UnsupportedEncodingException, URISyntaxException { - try { - URIBuilder uriBuilder = new URIBuilder(url); - return removeQueryParameters(uriBuilder); - } catch (URISyntaxException e) { - return tryHandlingCaseNotAValidURI(url); - } - } - - protected String 
tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { - return "toto"; + return tryHandlingCaseNotAValidURI(url); // try { -// String queryPart = urlStr.substring(urlStr.indexOf("?"), Integer.MAX_VALUE); -// EncodedQueryPart = -// URI uri = new URI(urlStr); -// return "toto"; -// URIBuilder uriBuilder = new URIBuilder(java.net.URLEncoder.encode(urlStr, "UTF-8")); -// String queryString = uri.getQuery(); -// URIBuilder uriBuilder = new URIBuilder(url.toURI()); -// String newUriEncoded = removeQueryParameters(uriBuilder); -// return java.net.URLDecoder.decode(newUriEncoded, "UTF-8"); -// } catch (MalformedURLException e) { -// throw new RuntimeException("dans ton cul", e); +// URIBuilder uriBuilder = new URIBuilder(url); +// return removeQueryParameters(uriBuilder); +// } catch (URISyntaxException e) { +// return tryHandlingCaseNotAValidURI(url); // } - } + + /** + * If input is not a valid URI, this may be because the URI has already been decoded. 
+ * @param urlStr + * @return + * @throws UnsupportedEncodingException + * @throws URISyntaxException + */ + protected abstract String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException; } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java index fd681937e..ee4ee6c23 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java @@ -2,7 +2,9 @@ import org.apache.http.client.utils.URIBuilder; +import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; +import java.util.Arrays; public class AllQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { @@ -11,4 +13,10 @@ protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxEx uriBuilder.removeQuery(); return uriBuilder.build().toString(); } + + @Override + protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); + return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); + } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java index 1406f2ca8..96bd2bf3f 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java @@ -1,13 +1,13 @@ package com.hurence.logisland.processor.webAnalytics.modele; +import org.apache.avro.reflect.MapEntry; import org.apache.http.NameValuePair; import org.apache.http.client.utils.URIBuilder; import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; public class KeepSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { @@ -32,46 +32,33 @@ protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxEx return uriBuilder.build().toString(); } -// private void toString(URI uri) { -// StringBuffer sb = new StringBuffer(); -// if (uri.getScheme() != null) { -// sb.append(uri.getScheme()); -// sb.append(':'); -// } -// if (isOpaque()) { -// sb.append(schemeSpecificPart); -// } else { -// if (host != null) { -// sb.append("//"); -// if (userInfo != null) { -// sb.append(userInfo); -// sb.append('@'); -// } -// boolean needBrackets = ((host.indexOf(':') >= 0) -// && !host.startsWith("[") -// && !host.endsWith("]")); -// if (needBrackets) sb.append('['); -// sb.append(host); -// if (needBrackets) sb.append(']'); -// if (port != -1) { -// sb.append(':'); -// sb.append(port); -// } -// } else if (authority != null) { -// sb.append("//"); -// sb.append(authority); -// } -// if (path != 
null) -// sb.append(path); -// if (query != null) { -// sb.append('?'); -// sb.append(query); -// } -// } -// if (fragment != null) { -// sb.append('#'); -// sb.append(fragment); -// } -// string = sb.toString(); -// } + @Override + protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); + Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) + .map(queryString -> queryString.split("=")) + .collect(Collectors.toMap( + keyValueArr -> keyValueArr[0], + keyValueArr -> { + String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); + return String.join("", values); + }, + (x, y) -> y, + LinkedHashMap::new + )); + + List> paramsNameValueFiltred = paramsNameValue.entrySet().stream() + .filter(p -> parameterToKeep.contains(p.getKey())) + .collect(Collectors.toList()); + if (paramsNameValueFiltred.isEmpty()) { + return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); + } else { + String newQueryString = paramsNameValueFiltred.stream() + .map(entry -> entry.getKey() + "=" + entry.getValue()) + .collect(Collectors.joining("&")); + return guessSplittedURI.getBeforeQuery() + + newQueryString + + guessSplittedURI.getAfterQuery(); + } + } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java index 4491a175d..939d326e7 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java @@ -5,8 +5,7 @@ import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; public class RemoveSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { @@ -30,4 +29,33 @@ protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxEx } return uriBuilder.build().toString(); } + + @Override + protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); + Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) + .map(queryString -> queryString.split("=")) + .collect(Collectors.toMap( + keyValueArr -> keyValueArr[0], + keyValueArr -> { + String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); + return String.join("", values); + }, + (x, y) -> y, + LinkedHashMap::new + )); + List> paramsNameValueFiltred = paramsNameValue.entrySet().stream() + .filter(p -> !parameterToRemove.contains(p.getKey())) + .collect(Collectors.toList()); + if (paramsNameValueFiltred.isEmpty()) { + return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); + } else { + String newQueryString = paramsNameValueFiltred.stream() + .map(entry -> entry.getKey() + "=" + entry.getValue()) + .collect(Collectors.joining("&")); + return guessSplittedURI.getBeforeQuery() + + newQueryString + + guessSplittedURI.getAfterQuery(); + } + } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java new file mode 100644 index 000000000..42dcc40fb --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java @@ -0,0 +1,67 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +public class SplittedURI { + private String beforeQuery; + private String query; + private String afterQuery; + + private SplittedURI(String beforeQuery, String query, String afterQuery) { + this.beforeQuery = beforeQuery; + this.query = query; + this.afterQuery = afterQuery; + } + + public String getBeforeQuery() { + return beforeQuery; + } + + public String getBeforeQueryWithoutQuestionMark() { + if (beforeQuery.isEmpty()) return beforeQuery; + char lastChar = beforeQuery.charAt(beforeQuery.length() - 1); + if (lastChar == '?') return beforeQuery.substring(0, beforeQuery.length() - 1); + return beforeQuery; + } + + public String getQuery() { + return query; + } + + public String getAfterQuery() { + return afterQuery; + } + + public static SplittedURI fromMalFormedURI(String malformedUri) { + if (malformedUri.isEmpty()) return new SplittedURI("", "", ""); + //select from first ? to # or end + String beforeQueryString = ""; + String queryString = ""; + String afterQueryString = ""; + int indexStr = 0; + int urlSize = malformedUri.length(); + char currentChar = malformedUri.charAt(0); + while(currentChar != '?' 
&& currentChar != '#') { + beforeQueryString += currentChar; + indexStr++; + if (indexStr >= urlSize) return new SplittedURI(beforeQueryString, queryString, afterQueryString); + currentChar=malformedUri.charAt(indexStr); + } + if (currentChar == '?') { + beforeQueryString += currentChar; + indexStr++; + if (indexStr >= urlSize) return new SplittedURI(beforeQueryString, queryString, afterQueryString); + currentChar=malformedUri.charAt(indexStr); + while(currentChar != '#') { + queryString += currentChar; + indexStr++; + if (indexStr >= urlSize) return new SplittedURI(beforeQueryString, queryString, afterQueryString); + currentChar=malformedUri.charAt(indexStr); + } + } + while(true) { + afterQueryString += currentChar; + indexStr++; + if (indexStr >= urlSize) return new SplittedURI(beforeQueryString, queryString, afterQueryString); + currentChar=malformedUri.charAt(indexStr); + } + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java index a1e80edc8..6176fd659 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java @@ -15,16 +15,23 @@ */ package com.hurence.logisland.processor.webAnalytics; +import com.hurence.logisland.processor.webAnalytics.modele.SplittedURI; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.record.StandardRecord; import com.hurence.logisland.util.runner.MockRecord; import com.hurence.logisland.util.runner.TestRunner; import 
com.hurence.logisland.util.runner.TestRunners; +import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.UnsupportedEncodingException; +import java.net.URISyntaxException; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + public class URLCleanerTest { private static final Logger logger = LoggerFactory.getLogger(URLCleanerTest.class); @@ -74,7 +81,38 @@ public class URLCleanerTest { private static final String expectedVal6 = "%%"; private static final String expectedVal3WithoutParams = "%%"; - + @Test + public void testUriBuilder() throws URISyntaxException { + SplittedURI url = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&page=2&sort=relevance"); +// private static final String url4 = "https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&page=2&sort=relevance"; + Assert.assertEquals("https://www.orexad.com/fr/search?", url.getBeforeQuery()); + Assert.assertEquals("q=sauterelle%7C%7Crelevance&page=2&sort=relevance", url.getQuery()); + Assert.assertEquals("", url.getAfterQuery()); + + SplittedURI urlDecoded = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"); +// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; + Assert.assertEquals("https://www.orexad.com/fr/search?", urlDecoded.getBeforeQuery()); + Assert.assertEquals("q=sauterelle||relevance&page=2&sort=relevance", urlDecoded.getQuery()); + Assert.assertEquals("", urlDecoded.getAfterQuery()); + + SplittedURI urlWithHashTag = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance#myTitle"); +// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; + Assert.assertEquals("https://www.orexad.com/fr/search?", 
urlWithHashTag.getBeforeQuery()); + Assert.assertEquals("q=sauterelle||relevance&page=2&sort=relevance", urlWithHashTag.getQuery()); + Assert.assertEquals("#myTitle", urlWithHashTag.getAfterQuery()); + + SplittedURI simpleUrl = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/"); +// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; + Assert.assertEquals("https://www.orexad.com/fr/", simpleUrl.getBeforeQuery()); + Assert.assertEquals("", simpleUrl.getQuery()); + Assert.assertEquals("", simpleUrl.getAfterQuery()); + + SplittedURI simpleUrlWithFragment = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/#gggg"); +// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; + Assert.assertEquals("https://www.orexad.com/fr/", simpleUrlWithFragment.getBeforeQuery()); + Assert.assertEquals("", simpleUrlWithFragment.getQuery()); + Assert.assertEquals("#gggg", simpleUrlWithFragment.getAfterQuery()); + } private Record getRecord1() { @@ -87,7 +125,7 @@ private Record getRecord1() { record1.setField("url2", FieldType.STRING, url2); record1.setField("url3", FieldType.STRING, url3); record1.setField("url4", FieldType.STRING, url4); -// record1.setField("url4Decoded", FieldType.STRING, url4Decoded); + record1.setField("url4Decoded", FieldType.STRING, url4Decoded); record1.setField("url5", FieldType.STRING, url5); record1.setField("val1", FieldType.STRING, val1); record1.setField("val2", FieldType.STRING, val2); @@ -289,9 +327,9 @@ public void testWithAlreadyDecodedUrls() { MockRecord errorRecord = testRunner.getOutputRecords().get(0); errorRecord.assertRecordSizeEquals(inputSize); errorRecord.assertFieldEquals("url4", expectedUrl4WithoutParams); - errorRecord.assertFieldEquals("url4Decoded", url4Decoded); errorRecord.assertFieldEquals("val2", val2); errorRecord.assertFieldEquals("val3", val3); + 
errorRecord.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); } @Test @@ -352,7 +390,7 @@ public void testKeepText() { out.assertFieldEquals("url2", expectedUrl2KeepText); out.assertFieldEquals("url3", expectedUrl3KeepText); out.assertFieldEquals("url4", expectedUrl4WithoutParams); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); } @Test @@ -375,7 +413,7 @@ public void testKeepQ() { out.assertFieldEquals("url2", expectedUrl2WithoutParams); out.assertFieldEquals("url3", expectedUrl3WithoutParams); out.assertFieldEquals("url4", expectedUrl4KeepQ); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepQ); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepQ); } @Test @@ -398,7 +436,7 @@ public void testKeepPage() { out.assertFieldEquals("url2", expectedUrl2WithoutParams); out.assertFieldEquals("url3", expectedUrl3WithoutParams); out.assertFieldEquals("url4", expectedUrl4KeepPage); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepPage); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepPage); } @Test @@ -421,7 +459,7 @@ public void testKeepTextAndQ() { out.assertFieldEquals("url2", expectedUrl2KeepText); out.assertFieldEquals("url3", expectedUrl3KeepText); out.assertFieldEquals("url4", expectedUrl4KeepQ); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepPage); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedKeepQ); } @@ -445,7 +483,7 @@ public void testRemoveText() { out.assertFieldEquals("url2", expectedUrl2RemoveText); out.assertFieldEquals("url3", expectedUrl3RemoveText); out.assertFieldEquals("url4", url4); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedWithoutParams); + out.assertFieldEquals("url4Decoded", url4Decoded); } @Test @@ -468,7 +506,7 @@ public void testRemoveQ() { out.assertFieldEquals("url2", expectedUrl2KeepText); out.assertFieldEquals("url3", 
expectedUrl3KeepText); out.assertFieldEquals("url4", expectedUrl4RemoveQ); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemoveQ); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemoveQ); } @Test @@ -491,7 +529,7 @@ public void testRemovePage() { out.assertFieldEquals("url2", expectedUrl2KeepText); out.assertFieldEquals("url3", expectedUrl3KeepText); out.assertFieldEquals("url4", expectedUrl4RemovePage); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemovePage); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemovePage); } @Test @@ -513,8 +551,32 @@ public void testRemoveTextAndQ() { out.assertFieldEquals("url1", expectedUrl1RemoveText); out.assertFieldEquals("url2", expectedUrl2RemoveText); out.assertFieldEquals("url3", expectedUrl3RemoveText); - out.assertFieldEquals("url4", expectedUrl4RemovePage); -// out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemovePage); + out.assertFieldEquals("url4", expectedUrl4RemoveQ); + out.assertFieldEquals("url4Decoded", expectedUrl4DecodedRemoveQ); + } + + @Test + public void specialTestDecodedQueryValuesInUrl() throws UnsupportedEncodingException { + Record record = new StandardRecord(); + record.setField("url", FieldType.STRING, "https://mydomain.com/my/path/to/file.html?a=kaka&b=robl&oc&h=on#myfragment"); + record.setField("urlEncode", FieldType.STRING, "https://mydomain.com/my/path/to/file.html?a=kaka&b=" + URLEncoder.encode("robl&oc", StandardCharsets.UTF_8.toString()) + "&h=on#myfragment"); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url, urlEncode"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "b"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = 
testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url", "https://mydomain.com/my/path/to/file.html?a=kaka&oc=&h=on#myfragment"); + out.assertFieldEquals("urlEncode", "https://mydomain.com/my/path/to/file.html?a=kaka&h=on#myfragment"); } + + } From 65c0f517061f5d7fd6499dda9b908c0f3c889fbe Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 14:30:54 +0100 Subject: [PATCH 05/17] Implemented the URLCleaner with a custom algorithm. considering the query to be between the first question mark '?' and then ended by the first number sign '#'. The query params are considered to be separated by '&' and their name and value by '='. --- .../processor/webAnalytics/URLCleaner.java | 8 ++-- .../modele/AbstractQueryParameterRemover.java | 32 ---------------- .../modele/AllQueryParameterRemover.java | 17 +-------- .../modele/KeepSomeQueryParameterRemover.java | 24 +----------- .../modele/QueryParameterRemover.java | 2 +- .../RemoveSomeQueryParameterRemover.java | 23 ++--------- .../webAnalytics/URLCleanerTest.java | 38 +++++++++++++++++++ 7 files changed, 50 insertions(+), 94 deletions(-) delete mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java index 9f17c81e0..02929cd2e 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java +++
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java @@ -219,14 +219,14 @@ private void tryUpdatingRecord(Record record, Map.Entry kv) { String inputFieldName = kv.getKey(); String outputFieldName = kv.getValue(); if (record.hasField(inputFieldName)) { - String value = record.getField(inputFieldName).asString();//TODO test if null in field + String value = record.getField(inputFieldName).asString(); if (value != null) { String cleanedUrl = null; try { cleanedUrl = remover.removeQueryParameters(value); - } catch (URISyntaxException | UnsupportedEncodingException e) { - getLogger().error("Error for url {}, for record {}.", new Object[]{value, record}, e); - String msg = "Could not parse url : '" + value + "' into URI.\n Cause: " + e.getMessage(); + } catch (Exception e) { + getLogger().error("Error for url {}, for record {}.", new Object[]{value, record.getId()}, e); + String msg = "Could not process url : '" + value + "'.\n Cause: " + e.getMessage(); record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); return; } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java deleted file mode 100644 index 4ccd02cb6..000000000 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.hurence.logisland.processor.webAnalytics.modele; - -import org.apache.http.client.utils.URIBuilder; - -import java.io.UnsupportedEncodingException; -import java.net.URI; -import 
java.net.URISyntaxException; -import java.util.Arrays; - -public abstract class AbstractQueryParameterRemover implements QueryParameterRemover { - - protected abstract String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException; - - public String removeQueryParameters(String url) throws UnsupportedEncodingException, URISyntaxException { - return tryHandlingCaseNotAValidURI(url); -// try { -// URIBuilder uriBuilder = new URIBuilder(url); -// return removeQueryParameters(uriBuilder); -// } catch (URISyntaxException e) { -// return tryHandlingCaseNotAValidURI(url); -// } - } - - /** - * If input is not a valid URI, this may be because the URI has already been decoded. - * @param urlStr - * @return - * @throws UnsupportedEncodingException - * @throws URISyntaxException - */ - protected abstract String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException; -} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java index ee4ee6c23..8864f1cd4 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java @@ -1,21 +1,8 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import org.apache.http.client.utils.URIBuilder; +public class AllQueryParameterRemover implements QueryParameterRemover { -import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; -import java.util.Arrays; - -public 
class AllQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { - - @Override - protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { - uriBuilder.removeQuery(); - return uriBuilder.build().toString(); - } - - @Override - protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + public String removeQueryParameters(String urlStr) { SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java index 96bd2bf3f..1a1e9737c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java @@ -1,16 +1,11 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import org.apache.avro.reflect.MapEntry; -import org.apache.http.NameValuePair; -import org.apache.http.client.utils.URIBuilder; - import java.io.UnsupportedEncodingException; -import java.net.URI; import java.net.URISyntaxException; import java.util.*; import java.util.stream.Collectors; -public class KeepSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { +public class KeepSomeQueryParameterRemover implements QueryParameterRemover { final Set parameterToKeep; 
@@ -18,22 +13,7 @@ public KeepSomeQueryParameterRemover(Set parameterToKeep) { this.parameterToKeep = parameterToKeep; } - @Override - protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { - List queryParameters = uriBuilder.getQueryParams() - .stream() - .filter(p -> parameterToKeep.contains(p.getName())) - .collect(Collectors.toList()); - if (queryParameters.isEmpty()) { - uriBuilder.removeQuery(); - } else { - uriBuilder.setParameters(queryParameters); - } - return uriBuilder.build().toString(); - } - - @Override - protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + public String removeQueryParameters(String urlStr) { SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) .map(queryString -> queryString.split("=")) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java index 99e772ac2..0ace42636 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java @@ -4,5 +4,5 @@ import java.net.URISyntaxException; public interface QueryParameterRemover { - String removeQueryParameters(String url) throws URISyntaxException, UnsupportedEncodingException; + String removeQueryParameters(String url); } diff --git 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java index 939d326e7..a89f8ca59 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java @@ -1,14 +1,11 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import org.apache.http.NameValuePair; -import org.apache.http.client.utils.URIBuilder; - import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; import java.util.*; import java.util.stream.Collectors; -public class RemoveSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { +public class RemoveSomeQueryParameterRemover implements QueryParameterRemover { final Set parameterToRemove; @@ -16,23 +13,9 @@ public RemoveSomeQueryParameterRemover(Set parameterToRemove) { this.parameterToRemove = parameterToRemove; } - @Override - protected String removeQueryParameters(URIBuilder uriBuilder) throws URISyntaxException { - List queryParameters = uriBuilder.getQueryParams() - .stream() - .filter(p -> !parameterToRemove.contains(p.getName())) - .collect(Collectors.toList()); - if (queryParameters.isEmpty()) { - uriBuilder.removeQuery(); - } else { - uriBuilder.setParameters(queryParameters); - } - return uriBuilder.build().toString(); - } - - @Override - protected String tryHandlingCaseNotAValidURI(String urlStr) throws UnsupportedEncodingException, URISyntaxException { + public 
String removeQueryParameters(String urlStr) { SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); + if (guessSplittedURI.getQuery().isEmpty()) return urlStr; Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) .map(queryString -> queryString.split("=")) .collect(Collectors.toMap( diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java index 6176fd659..02a0501f6 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java @@ -578,5 +578,43 @@ public void specialTestDecodedQueryValuesInUrl() throws UnsupportedEncodingExcep } + @Test + public void testNullField() { + Record record = new StandardRecord(); + record.setField("url", FieldType.STRING, null); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "b"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertNullField("url"); + } + @Test + public void testEmptyField() { + Record record = new StandardRecord(); + record.setField("url", FieldType.STRING, ""); + + TestRunner testRunner = 
TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "b"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url",""); + } } From c951f1efa168408c0fddcdcb779897815baa2e87 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 15:25:34 +0100 Subject: [PATCH 06/17] Factorized some code. Wrote documentation on URLCleaner. Fixed some bugs. Added asChar method to PropertyValue. Added parameter to choose separator for parameters in query part of the uri. --- .../processor/webAnalytics/URLCleaner.java | 46 +++++++++++----- .../modele/AbstractQueryParameterRemover.java | 52 +++++++++++++++++++ .../modele/KeepSomeQueryParameterRemover.java | 37 +++---------- .../modele/QueryParameterRemover.java | 3 -- .../RemoveSomeQueryParameterRemover.java | 39 ++++---------- .../webAnalytics/URLCleanerTest.java | 42 ++++++++++++++- .../component/AbstractPropertyValue.java | 15 ++++++ .../logisland/component/PropertyValue.java | 12 +++-- .../validator/StandardValidators.java | 22 ++++++++ 9 files changed, 190 insertions(+), 78 deletions(-) create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java index 02929cd2e..2a785c512 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java @@ -15,7 +15,6 @@ */ package com.hurence.logisland.processor.webAnalytics; -import com.hurence.logisland.annotation.behavior.DynamicProperty; import com.hurence.logisland.annotation.documentation.CapabilityDescription; import com.hurence.logisland.annotation.documentation.ExtraDetailFile; import com.hurence.logisland.annotation.documentation.Tags; @@ -35,17 +34,20 @@ import com.hurence.logisland.validator.ValidationContext; import com.hurence.logisland.validator.ValidationResult; -import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; import java.util.*; -@Tags({"record", "fields", "Decode"}) -@CapabilityDescription("Decode one or more field containing an URL with possibly special chars encoded\n" + - "...") -@DynamicProperty(name = "fields to decode", - supportsExpressionLanguage = false, - value = "a default value", - description = "Decode one or more fields from the record ") +@Tags({"record", "fields", "url", "params", "param", "remove", "keep", "query", "uri", "parameter", "clean", "decoded", "raw"}) +@CapabilityDescription("Remove some or all query parameters from one or more field containing an uri which should be preferably encoded.\n" + + "If the uri is not encoded the behaviour is not defined in case the decoded uri contains '#', '?', '=', '&' which were encoded.\n" + + "Indeed this processor assumes that the start of query part of the uri start at the first '?' 
then end at the first '#' or at the end of the uri as\n"+ + "specified by rfc3986 available at https://tools.ietf.org/html/rfc3986#section-3.4. \n" + + "We assume as well that key value pairs are separed by '=', and are separed by '&': exemple 'param1=value1&param2=value2'.\n" + + "The processor can remove also parameters that have only a name and no value. The character used to separate the key and the value '=' is configurable.\n" + + "The character used to separate two parameters '&' is also configurable.") +//Another solution could be to use the regex specified here https://tools.ietf.org/html/rfc3986#appendix-B +//to get the query part. +//the query part can be anything! It is not necessarily a list of key-value params... +//example http://host.com/path?mysyntax&pretty&size=2#anchor @ExtraDetailFile("./details/URLDecoder-Detail.rst") public class URLCleaner extends AbstractProcessor { @@ -101,6 +103,22 @@ public class URLCleaner extends AbstractProcessor { .addValidator(StandardValidators.BOOLEAN_VALIDATOR) .build(); + public static final PropertyDescriptor PARAMETER_SEPARATOR = new PropertyDescriptor.Builder() + .name("parameter.separator") + .description("the character to use to separate the parameters in the query part of the uris") + .required(true) + .defaultValue("&") + .addValidator(StandardValidators.CHAR_VALIDATOR) + .build(); + + public static final PropertyDescriptor KEY_VALUE_SEPARATOR = new PropertyDescriptor.Builder() + .name("key.value.separator") + .description("the character to use to separate the parameter name from the parameter value in the query part of the uris") + .required(true) + .defaultValue("=") + .addValidator(StandardValidators.CHAR_VALIDATOR) + .build(); + private final Map fieldsToDecodeToOutputField = new HashMap<>(); private String conflictPolicy; private QueryParameterRemover remover; @@ -113,6 +131,8 @@ public List getSupportedPropertyDescriptors() { descriptors.add(KEEP_PARAMS); descriptors.add(REMOVE_PARAMS); 
descriptors.add(REMOVE_ALL_PARAMS); + descriptors.add(PARAMETER_SEPARATOR); + descriptors.add(KEY_VALUE_SEPARATOR); return Collections.unmodifiableList(descriptors); } @@ -178,18 +198,20 @@ public void initFieldsToDecodeToOutputFiles(ProcessContext context) { } public void initRemover(ProcessContext context) throws InitializationException { + char keyValueSeparator = context.getPropertyValue(KEY_VALUE_SEPARATOR).asChar(); + char parameterSeparator = context.getPropertyValue(PARAMETER_SEPARATOR).asChar(); if (context.getPropertyValue(KEEP_PARAMS).isSet()) { String commaSeparatedKeepParams = context.getPropertyValue(KEEP_PARAMS).asString(); String[] keepParamsArr = commaSeparatedKeepParams.split("\\s*,\\s*"); final Set keepParams = new HashSet<>(Arrays.asList(keepParamsArr)); - this.remover = new KeepSomeQueryParameterRemover(keepParams); + this.remover = new KeepSomeQueryParameterRemover(keepParams, keyValueSeparator, parameterSeparator); return; } if (context.getPropertyValue(REMOVE_PARAMS).isSet()) { String commaSeparatedRemoveParam = context.getPropertyValue(REMOVE_PARAMS).asString(); String[] removeParamsArr = commaSeparatedRemoveParam.split("\\s*,\\s*"); final Set removeParams = new HashSet<>(Arrays.asList(removeParamsArr)); - this.remover = new RemoveSomeQueryParameterRemover(removeParams); + this.remover = new RemoveSomeQueryParameterRemover(removeParams, keyValueSeparator, parameterSeparator); return; } if (!context.getPropertyValue(REMOVE_ALL_PARAMS).isSet() || context.getPropertyValue(REMOVE_ALL_PARAMS).asBoolean()) { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java new file mode 100644 index 000000000..9c5d2e024 --- /dev/null +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java @@ -0,0 +1,52 @@ +package com.hurence.logisland.processor.webAnalytics.modele; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public abstract class AbstractQueryParameterRemover implements QueryParameterRemover { + final char keyValueSeparator; + final char parameterSeparator; + + public AbstractQueryParameterRemover(char keyValueSeparator, char parameterSeparator) { + this.keyValueSeparator = keyValueSeparator; + this.parameterSeparator = parameterSeparator; + } + + public String removeQueryParameters(String urlStr) { + SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); + if (guessSplittedURI.getQuery().isEmpty()) return urlStr; + Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split(String.valueOf(parameterSeparator))) + .map(queryString -> queryString.split(String.valueOf(keyValueSeparator))) + .collect(Collectors.toMap( + keyValueArr -> keyValueArr[0], + keyValueArr -> { + String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); + return String.join("", values); + }, + (x, y) -> y, + LinkedHashMap::new + )); + List> paramsNameValueFiltred = filterParams(paramsNameValue); + if (paramsNameValueFiltred.isEmpty()) { + return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); + } else { + String newQueryString = paramsNameValueFiltred.stream() + .map(entry -> { + if (entry.getValue().isEmpty()) { + return entry.getKey(); + } else { + return entry.getKey() + keyValueSeparator + entry.getValue(); + } + }) + .collect(Collectors.joining(String.valueOf(parameterSeparator))); + return guessSplittedURI.getBeforeQuery() + + newQueryString + + guessSplittedURI.getAfterQuery(); + } + } + + protected abstract List> 
filterParams(Map paramsNameValue); +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java index 1a1e9737c..3c233f68b 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java @@ -1,44 +1,23 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; -import java.util.*; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; -public class KeepSomeQueryParameterRemover implements QueryParameterRemover { +public class KeepSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { final Set parameterToKeep; - public KeepSomeQueryParameterRemover(Set parameterToKeep) { + public KeepSomeQueryParameterRemover(Set parameterToKeep, char keyValueSeparator, char parameterSeparator) { + super(keyValueSeparator, parameterSeparator); this.parameterToKeep = parameterToKeep; } - public String removeQueryParameters(String urlStr) { - SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); - Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) - .map(queryString -> queryString.split("=")) - .collect(Collectors.toMap( - keyValueArr -> keyValueArr[0], - keyValueArr -> { - String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); - return 
String.join("", values); - }, - (x, y) -> y, - LinkedHashMap::new - )); - + protected List> filterParams(Map paramsNameValue) { List> paramsNameValueFiltred = paramsNameValue.entrySet().stream() .filter(p -> parameterToKeep.contains(p.getKey())) .collect(Collectors.toList()); - if (paramsNameValueFiltred.isEmpty()) { - return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); - } else { - String newQueryString = paramsNameValueFiltred.stream() - .map(entry -> entry.getKey() + "=" + entry.getValue()) - .collect(Collectors.joining("&")); - return guessSplittedURI.getBeforeQuery() + - newQueryString + - guessSplittedURI.getAfterQuery(); - } + return paramsNameValueFiltred; } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java index 0ace42636..27453df6a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java @@ -1,8 +1,5 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; - public interface QueryParameterRemover { String removeQueryParameters(String url); } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java index a89f8ca59..422b39a5a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java @@ -1,44 +1,25 @@ package com.hurence.logisland.processor.webAnalytics.modele; -import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; -import java.util.*; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.stream.Collectors; -public class RemoveSomeQueryParameterRemover implements QueryParameterRemover { +public class RemoveSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { final Set parameterToRemove; - public RemoveSomeQueryParameterRemover(Set parameterToRemove) { + + public RemoveSomeQueryParameterRemover(Set parameterToRemove, char keyValueSeparator, char parameterSeparator) { + super(keyValueSeparator, parameterSeparator); this.parameterToRemove = parameterToRemove; } - public String removeQueryParameters(String urlStr) { - SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); - if (guessSplittedURI.getQuery().isEmpty()) return urlStr; - Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split("&")) - .map(queryString -> queryString.split("=")) - .collect(Collectors.toMap( - keyValueArr -> keyValueArr[0], - keyValueArr -> { - String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); - return String.join("", values); - }, - (x, y) -> y, - LinkedHashMap::new - )); + @Override + protected List> filterParams(Map paramsNameValue) { List> 
paramsNameValueFiltred = paramsNameValue.entrySet().stream() .filter(p -> !parameterToRemove.contains(p.getKey())) .collect(Collectors.toList()); - if (paramsNameValueFiltred.isEmpty()) { - return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); - } else { - String newQueryString = paramsNameValueFiltred.stream() - .map(entry -> entry.getKey() + "=" + entry.getValue()) - .collect(Collectors.joining("&")); - return guessSplittedURI.getBeforeQuery() + - newQueryString + - guessSplittedURI.getAfterQuery(); - } + return paramsNameValueFiltred; } } diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java index 02a0501f6..6ec0db89a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java @@ -573,7 +573,7 @@ public void specialTestDecodedQueryValuesInUrl() throws UnsupportedEncodingExcep MockRecord out = testRunner.getOutputRecords().get(0); out.assertRecordSizeEquals(record.size()); - out.assertFieldEquals("url", "https://mydomain.com/my/path/to/file.html?a=kaka&oc=&h=on#myfragment"); + out.assertFieldEquals("url", "https://mydomain.com/my/path/to/file.html?a=kaka&oc&h=on#myfragment"); out.assertFieldEquals("urlEncode", "https://mydomain.com/my/path/to/file.html?a=kaka&h=on#myfragment"); } @@ -617,4 +617,44 @@ public void testEmptyField() { out.assertRecordSizeEquals(record.size()); out.assertFieldEquals("url",""); } + + @Test + public void testJustKeyInQuery() { + Record record = new StandardRecord(); 
+ record.setField("url", FieldType.STRING, "http://host.com/path?mysyntax&pretty&size=2#anchor"); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "mysyntax"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url","http://host.com/path?pretty&size=2#anchor"); + } + + @Test + public void testJustKeyInQuery2() { + Record record = new StandardRecord(); + record.setField("url", FieldType.STRING, "http://host.com/path?mysyntax&pretty&size=2#anchor"); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "pretty"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url","http://host.com/path?mysyntax&size=2#anchor"); + } } diff --git a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/AbstractPropertyValue.java b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/AbstractPropertyValue.java index d2f7f8c0c..72abcd264 100644 --- a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/AbstractPropertyValue.java +++ 
b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/AbstractPropertyValue.java @@ -172,6 +172,21 @@ public char[] asChars() { } } + @Override + public char asChar() { + if (getRawValue() == null) { + throw new IllegalArgumentException("null is not a char"); + } else if (getRawValue() instanceof Character) { + return ((char) getRawValue()); + } else { + try { + return asString().charAt(0); + } catch (Exception ex) { + logger.error(" : unable to convert " + rawValue.toString() + " as a char", ex); + throw ex; + } + } + } @Override public boolean isSet() { diff --git a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/PropertyValue.java b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/PropertyValue.java index b528bb460..1c3574664 100644 --- a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/PropertyValue.java +++ b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/component/PropertyValue.java @@ -62,20 +62,24 @@ default Optional asStringOpt() { /** - * @return a byte[] representation of the property value, of - * null if not set + * @return a byte[] representation of the property value * @throws IllegalArgumentException if not able to parse */ byte[] asBytes(); /** - * @return a char[] representation of the property value, of - * null if not set + * @return a char[] representation of the property value * @throws IllegalArgumentException if not able to parse */ char[] asChars(); + /** + * @return a char representation of the property value + * @throws IllegalArgumentException if not able to parse + */ + char asChar(); + /** * @return a Record representation of the property value, or * null if not set diff --git a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java index 854f9dfc9..7d96c1bd5 100644 --- 
a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java +++ b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java @@ -116,6 +116,28 @@ public ValidationResult validate(final String subject, final String value) { } }; + public static final Validator CHAR_VALIDATOR = new Validator() { + @Override + public ValidationResult validate(final String subject, final String value) { + String reason = null; + try { + if (value == null) { + reason = "null is not a valid character"; + } else { + if (value.length() != 1) { + reason = "Not a valid character !"; + } else { + char character = value.charAt(0); + } + } + } catch (final NumberFormatException e) { + reason = "Not a valid character"; + } + + return new ValidationResult.Builder().subject(subject).input(value).explanation(reason).valid(reason == null).build(); + } + }; + public static final Validator POSITIVE_DOUBLE_VALIDATOR = new Validator() { @Override public ValidationResult validate(final String subject, final String value) { From 08c7508d5448b292236d53a67ec980f1f42062a8 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 17:31:58 +0100 Subject: [PATCH 07/17] Added a URIDecoder processor that decodes a valid URI string. Using the java URI class. 
--- .../processor/webAnalytics/URIDecoder.java | 142 +++++++++++ .../webAnalytics/URIDecoderTest.java | 225 ++++++++++++++++++ .../webAnalytics/URLDecoderTest.java | 20 +- 3 files changed, 379 insertions(+), 8 deletions(-) create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java create mode 100644 logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java new file mode 100644 index 000000000..739283c47 --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java @@ -0,0 +1,142 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.processor.webAnalytics; + +import com.hurence.logisland.annotation.documentation.CapabilityDescription; +import com.hurence.logisland.annotation.documentation.ExtraDetailFile; +import com.hurence.logisland.annotation.documentation.Tags; +import com.hurence.logisland.component.InitializationException; +import com.hurence.logisland.component.PropertyDescriptor; +import com.hurence.logisland.processor.AbstractProcessor; +import com.hurence.logisland.processor.ProcessContext; +import com.hurence.logisland.processor.ProcessError; +import com.hurence.logisland.record.FieldType; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.validator.StandardValidators; +import com.sun.jndi.toolkit.url.Uri; +import org.apache.http.client.utils.URIBuilder; + +import java.io.UnsupportedEncodingException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.Charset; +import java.util.*; +import java.util.stream.Collectors; + +@Tags({"record", "fields", "Decode"}) +@CapabilityDescription("Decode one or more field containing an URI with possibly special chars encoded\n" + + "...") +@ExtraDetailFile("./details/URLDecoder-Detail.rst") +public class URIDecoder extends AbstractProcessor { + + private final static String UTF8_CHARSET = "UTF-8"; + + private static final PropertyDescriptor FIELDS_TO_DECODE_PROP = new PropertyDescriptor.Builder() + .name("decode.fields") + .description("List of fields (URL) to decode") + .required(true) + .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build(); + + private static final PropertyDescriptor CHARSET_PROP = new PropertyDescriptor.Builder() + .name("charset") + .description("Charset to use to decode the URL") + .required(true) + .defaultValue(UTF8_CHARSET) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build(); + + private Set fieldsToDecode; + private String charset; + + 
@Override + public List getSupportedPropertyDescriptors() { + final List descriptors = new ArrayList<>(); + descriptors.add(FIELDS_TO_DECODE_PROP); + descriptors.add(CHARSET_PROP); + return Collections.unmodifiableList(descriptors); + } + + + public void init(ProcessContext context) throws InitializationException { + super.init(context); + charset = context.getPropertyValue(CHARSET_PROP).asString(); + initFieldsToDecode(context); + } + + public void initFieldsToDecode(ProcessContext context) { + String commaSeparatedFields = context.getPropertyValue(FIELDS_TO_DECODE_PROP).asString(); + String[] fieldsArr = commaSeparatedFields.split("\\s*,\\s*"); + fieldsToDecode = new HashSet(); + Collections.addAll(fieldsToDecode, fieldsArr); + } + + @Override + public Collection process(ProcessContext context, Collection records) { + for (Record record : records) { + updateRecord(record); + } + return records; + } + + + private void updateRecord(Record record) { + fieldsToDecode.forEach(fieldName -> { + if (record.hasField(fieldName)) { + String uriStr = record.getField(fieldName).asString(); + if (uriStr != null && !uriStr.isEmpty()) { + decode(uriStr, record, fieldName); + } + } + }); + } + + private void decode(String uriStr, Record record, String fieldNameToSetDecodedUri) + { + try { + String decodedURI = uriToDecodedString(new URI(uriStr)); + if (!decodedURI.equals(uriStr)) { + final FieldType fieldType = record.getField(fieldNameToSetDecodedUri).getType(); + record.removeField(fieldNameToSetDecodedUri); + record.setField(fieldNameToSetDecodedUri, fieldType, decodedURI); + } + } catch (Exception e){ + getLogger().error("Error while trying to decode uri {}, for record {}.", new Object[]{uriStr, record.getId()}, e); + String msg = "Could not process uri : '" + uriStr + "'.\n Cause: " + e.getMessage(); + record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); + } + } + + + /**A URI is like + [scheme:]scheme-specific-part[#fragment] + @see URI + */ + private String uriToDecodedString(URI 
uri) { + String uriStr = ""; + if (uri.getScheme() != null && !uri.getScheme().isEmpty()) { + uriStr += uri.getScheme() + ":"; + } + if (uri.getSchemeSpecificPart() != null && !uri.getSchemeSpecificPart().isEmpty()) { + uriStr += uri.getSchemeSpecificPart(); + } + if (uri.getFragment() != null && !uri.getFragment().isEmpty()) { + uriStr += "#" + uri.getFragment(); + } + return uriStr; + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java new file mode 100644 index 000000000..4dca81084 --- /dev/null +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java @@ -0,0 +1,225 @@ +/** + * Copyright (C) 2016 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.hurence.logisland.processor.webAnalytics; + +import com.hurence.logisland.processor.Processor; +import com.hurence.logisland.record.FieldType; +import com.hurence.logisland.record.Record; +import com.hurence.logisland.record.StandardRecord; +import com.hurence.logisland.util.runner.MockRecord; +import com.hurence.logisland.util.runner.TestRunner; +import com.hurence.logisland.util.runner.TestRunners; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class URIDecoderTest { + + public Processor getDecoder() { + return new URIDecoder(); + } + + private static final Logger logger = LoggerFactory.getLogger(URIDecoderTest.class); + + private static final String urlVal1 = "https://www.test.com/de/search/?text=toto"; + private static final String expectedDecodedUrlVal1 = urlVal1; + + private static final String urlVal2 = "https://www.test.com/de/search/?text=calendrier%20%20%202019"; + private static final String expectedDecodedUrlVal2 = "https://www.test.com/de/search/?text=calendrier 2019"; + + private static final String urlVal3 = "https://www.test.com/en/search/?text=key1+%20key2%20+%28key3-key4%29"; + private static final String expectedDecodedUrlVal3 = "https://www.test.com/en/search/?text=key1+ key2 +(key3-key4)"; + + private static final String val4 = "key1+%20key2%20+%28key3-key4%29"; + private static final String expectedDecodedVal4 = "key1+ key2 +(key3-key4)"; + + private static final String val5 = "%co"; + private static final String expectedDecodedVal5 = "%co"; + + private static final String val6 = "%%"; + private static final String expectedDecodedVal6 = "%%"; + + private static final String value1 = "value1"; + private static final String value2 = "value2"; + + + private Record getRecord1() { + Record record1 = new StandardRecord(); + record1.setField("string1", FieldType.STRING, value1); + record1.setField("string2", FieldType.STRING, value2); + record1.setField("long1", 
FieldType.LONG, 1); + record1.setField("long2", FieldType.LONG, 2); + record1.setField("url1", FieldType.STRING, urlVal1); + record1.setField("url2", FieldType.STRING, urlVal2); + record1.setField("url3", FieldType.STRING, urlVal3); + record1.setField("val4", FieldType.STRING, val4); + return record1; + } + + private Record getRecord2() { + Record record1 = new StandardRecord(); + record1.setField("string1", FieldType.STRING, value1); + record1.setField("string2", FieldType.STRING, value2); + record1.setField("long1", FieldType.LONG, 1); + record1.setField("long2", FieldType.LONG, 2); + record1.setField("url1", FieldType.STRING, urlVal1); + record1.setField("url2", FieldType.STRING, urlVal2); + record1.setField("val5", FieldType.STRING, val5); + record1.setField("val6", FieldType.STRING, val6); + return record1; + } + + + @Test + public void testNoURLValueField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "string1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldTypeEquals("string1", FieldType.STRING); + } + + @Test + public void testBasicURLValueField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "string1, url1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldEquals("url1", 
expectedDecodedUrlVal1); + } + + @Test + public void testEncodedURLValueField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "url2,string1, url1"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldEquals("url1", expectedDecodedUrlVal1); + out.assertFieldEquals("url2", expectedDecodedUrlVal2); + } + + @Test + public void testComplexEncodedURLValueField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "url2,string1, url1, url3"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldEquals("url1", expectedDecodedUrlVal1); + out.assertFieldEquals("url2", expectedDecodedUrlVal2); + out.assertFieldEquals("url3", expectedDecodedUrlVal3); + } + + @Test + public void testComplexEncodedValueField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "url2,string1, url1, val4"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldEquals("url1", 
expectedDecodedUrlVal1); + out.assertFieldEquals("url2", expectedDecodedUrlVal2); + out.assertFieldEquals("val4", expectedDecodedVal4); + } + + @Test + public void testNoMatchingField() { + + Record record1 = getRecord1(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "nonExistingField"); + testRunner.assertValid(); + testRunner.enqueue(record1); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record1.size()); + out.assertFieldEquals("string1", value1); + out.assertFieldEquals("url1", urlVal1); + out.assertFieldEquals("url2", urlVal2); + out.assertFieldEquals("val4", val4); + } + + @Test + public void testPercentButNotHexaField() { + + Record record2 = getRecord2(); + + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); + testRunner.setProperty("decode.fields", "val5,val6"); + testRunner.assertValid(); + testRunner.enqueue(record2); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(0); + testRunner.assertOutputErrorCount(1); + + MockRecord out = testRunner.getErrorRecords().get(0); + Assert.assertEquals(2, out.getErrors().size()); + } +} diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java index b611c5f91..08e839c94 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java @@ -15,6 +15,7 @@ */ package com.hurence.logisland.processor.webAnalytics; +import com.hurence.logisland.processor.Processor; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.record.StandardRecord; @@ -27,6 +28,10 @@ public class URLDecoderTest { + public Processor getDecoder() { + return new URLDecoder(); + } + private static final Logger logger = LoggerFactory.getLogger(URLDecoderTest.class); private static final String urlVal1 = "https://www.test.com/de/search/?text=toto"; @@ -79,7 +84,7 @@ public void testNoURLValueField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "string1"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -98,7 +103,7 @@ public void testBasicURLValueField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "string1, url1"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -117,7 +122,7 @@ public void testEncodedURLValueField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "url2,string1, url1"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -137,7 +142,7 @@ public void testComplexEncodedURLValueField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); 
testRunner.setProperty("decode.fields", "url2,string1, url1, url3"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -158,7 +163,7 @@ public void testComplexEncodedValueField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "url2,string1, url1, val4"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -179,7 +184,7 @@ public void testNoMatchingField() { Record record1 = getRecord1(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "nonExistingField"); testRunner.assertValid(); testRunner.enqueue(record1); @@ -200,7 +205,7 @@ public void testPercentButNotHexaField() { Record record2 = getRecord2(); - TestRunner testRunner = TestRunners.newTestRunner(new URLDecoder()); + TestRunner testRunner = TestRunners.newTestRunner(getDecoder()); testRunner.setProperty("decode.fields", "val5,val6"); testRunner.assertValid(); testRunner.enqueue(record2); @@ -213,5 +218,4 @@ public void testPercentButNotHexaField() { out.assertFieldEquals("val5", expectedDecodedVal5); out.assertFieldEquals("val6", expectedDecodedVal6); } - } From 2a10942f700aff5d37e08ed77456f4919ccc9fea Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 17:32:52 +0100 Subject: [PATCH 08/17] Removed unused dependency. 
--- .../logisland-processor-web-analytics/pom.xml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml b/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml index 2324a7bb1..b872b978e 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/pom.xml @@ -69,21 +69,12 @@ true - - org.apache.httpcomponents - httpclient - 4.5.12 - - - com.hurence.logisland logisland-processor-common ${project.version} test - - From 88068ce76d293837152a14e9f8dbea30a8b0a8a4 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 17:34:55 +0100 Subject: [PATCH 09/17] Removed unused property in URIDecoder --- .../logisland/processor/webAnalytics/URIDecoder.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java index 739283c47..7eba41361 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java @@ -52,29 +52,18 @@ public class URIDecoder extends AbstractProcessor { .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .build(); - private static final PropertyDescriptor CHARSET_PROP = new PropertyDescriptor.Builder() - .name("charset") - .description("Charset to use to decode the URL") - .required(true) - .defaultValue(UTF8_CHARSET) - 
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR) - .build(); - private Set fieldsToDecode; - private String charset; @Override public List getSupportedPropertyDescriptors() { final List descriptors = new ArrayList<>(); descriptors.add(FIELDS_TO_DECODE_PROP); - descriptors.add(CHARSET_PROP); return Collections.unmodifiableList(descriptors); } public void init(ProcessContext context) throws InitializationException { super.init(context); - charset = context.getPropertyValue(CHARSET_PROP).asString(); initFieldsToDecode(context); } From 49d3d0df9869aab251df8261b1243e4e59a77c71 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 17:37:12 +0100 Subject: [PATCH 10/17] replaced warning log when URLDecoder fails by an error added into the record. --- .../processor/webAnalytics/URLDecoder.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java index f8ed6ead9..a7fd16d7c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java @@ -23,6 +23,7 @@ import com.hurence.logisland.component.PropertyDescriptor; import com.hurence.logisland.processor.AbstractProcessor; import com.hurence.logisland.processor.ProcessContext; +import com.hurence.logisland.processor.ProcessError; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.validator.StandardValidators; @@ -121,12 +122,15 @@ private void decode (String 
value, String charset, Record record, String fieldNa if (tryTrick) { value = value.replaceAll("%(?![0-9a-fA-F]{2})", percentEncodedChar); decode(value, charset, record, fieldName, false); + } else { + getLogger().error("Error while trying to decode url {}, for record {}.", new Object[]{value, record.getId()}, e); + String msg = "Could not process url : '" + value + "'.\n Cause: " + e.getMessage(); + record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); } - else { - getLogger().warn(e.toString()); - } - } catch (Exception e){ - getLogger().warn(e.toString()); + } catch (Exception e) { + getLogger().error("Error while trying to decode url {}, for record {}.", new Object[]{value, record.getId()}, e); + String msg = "Could not process url : '" + value + "'.\n Cause: " + e.getMessage(); + record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg); } } From 76809a84d271ddd7d1d670b01f7de66aa4ca53bc Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 18:14:28 +0100 Subject: [PATCH 11/17] Removed unused import, Changed some property names in URLCleaner. 
--- .../logisland/processor/webAnalytics/URIDecoder.java | 6 ------ .../logisland/processor/webAnalytics/URLCleaner.java | 6 +++--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java index 7eba41361..dbd10948c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java @@ -26,15 +26,9 @@ import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.validator.StandardValidators; -import com.sun.jndi.toolkit.url.Uri; -import org.apache.http.client.utils.URIBuilder; -import java.io.UnsupportedEncodingException; import java.net.URI; -import java.net.URISyntaxException; -import java.nio.charset.Charset; import java.util.*; -import java.util.stream.Collectors; @Tags({"record", "fields", "Decode"}) @CapabilityDescription("Decode one or more field containing an URI with possibly special chars encoded\n" + diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java index 2a785c512..389afca88 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java @@ -74,9 +74,9 @@ public class URLCleaner extends AbstractProcessor { .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .build(); - public static final String PARAM_NAMES_INCLUDE_PROP_NAME = "param.names.include"; - public static final String REMOVE_PARAMS_PROP_NAME = "param.names.exclude"; - public static final String REMOVE_ALL_PARAMS_PROP_NAME = "remove.all.params"; + public static final String PARAM_NAMES_INCLUDE_PROP_NAME = "url.keep.params"; + public static final String REMOVE_PARAMS_PROP_NAME = "url.remove.params"; + public static final String REMOVE_ALL_PARAMS_PROP_NAME = "url.remove.all"; public static final PropertyDescriptor KEEP_PARAMS = new PropertyDescriptor.Builder() .name(PARAM_NAMES_INCLUDE_PROP_NAME) From beb0835df2119637ae4a418a0faf1689255b4f09 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 18:22:16 +0100 Subject: [PATCH 12/17] Renamed webAnalytics package to webanalytics as told by codacy. 
--- .../ConsolidateSession.java | 2 +- .../IncrementalWebSession.java | 2 +- .../SetSourceOfTraffic.java | 4 ++-- .../{webAnalytics => webanalytics}/URIDecoder.java | 2 +- .../{webAnalytics => webanalytics}/URLCleaner.java | 10 +++++----- .../{webAnalytics => webanalytics}/URLDecoder.java | 5 +---- .../modele/AbstractQueryParameterRemover.java | 2 +- .../modele/AllQueryParameterRemover.java | 2 +- .../modele/KeepSomeQueryParameterRemover.java | 2 +- .../modele/QueryParameterRemover.java | 2 +- .../modele/RemoveSomeQueryParameterRemover.java | 2 +- .../modele/SplittedURI.java | 2 +- .../ConsolidateSessionTest.java | 2 +- .../IncrementalWebSessionTest.java | 2 +- .../MockCacheService.java | 2 +- .../MockElasticsearchClientService.java | 3 +-- .../{webAnalytics => webanalytics}/URIDecoderTest.java | 2 +- .../{webAnalytics => webanalytics}/URLCleanerTest.java | 4 ++-- .../{webAnalytics => webanalytics}/URLDecoderTest.java | 2 +- .../setSourceOfTrafficTest.java | 4 ++-- 20 files changed, 27 insertions(+), 31 deletions(-) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/ConsolidateSession.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/IncrementalWebSession.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/SetSourceOfTraffic.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URIDecoder.java (98%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URLCleaner.java (97%) rename 
logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URLDecoder.java (97%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/AbstractQueryParameterRemover.java (97%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/AllQueryParameterRemover.java (84%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/KeepSomeQueryParameterRemover.java (93%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/QueryParameterRemover.java (59%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/RemoveSomeQueryParameterRemover.java (93%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/modele/SplittedURI.java (97%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/ConsolidateSessionTest.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/IncrementalWebSessionTest.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/MockCacheService.java (96%) rename 
logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/MockElasticsearchClientService.java (98%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URIDecoderTest.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URLCleanerTest.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/URLDecoderTest.java (99%) rename logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/{webAnalytics => webanalytics}/setSourceOfTrafficTest.java (99%) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSession.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/ConsolidateSession.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSession.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/ConsolidateSession.java index fb6d7d2fa..7e086ab1b 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSession.java +++ 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/ConsolidateSession.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.annotation.documentation.CapabilityDescription; import com.hurence.logisland.annotation.documentation.ExtraDetailFile; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSession.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSession.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSession.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSession.java index d8c9e025d..229d012ca 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSession.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSession.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.annotation.documentation.*; import com.hurence.logisland.classloading.PluginProxy; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/SetSourceOfTraffic.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/SetSourceOfTraffic.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java index f8e36798c..faabd5981 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/SetSourceOfTraffic.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.annotation.documentation.*; import com.hurence.logisland.classloading.PluginProxy; @@ -39,7 +39,7 @@ import java.util.*; import java.util.regex.Pattern; -import static com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic.*; +import static com.hurence.logisland.processor.webanalytics.SetSourceOfTraffic.*; @Category(ComponentCategory.ANALYTICS) @Tags({"session", "traffic", "source", "web", "analytics"}) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java similarity index 98% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java index dbd10948c..19f04a344 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URIDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.annotation.documentation.CapabilityDescription; import com.hurence.logisland.annotation.documentation.ExtraDetailFile; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java similarity index 97% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java index 389afca88..01b8fde55 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLCleaner.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.annotation.documentation.CapabilityDescription; import com.hurence.logisland.annotation.documentation.ExtraDetailFile; @@ -24,10 +24,10 @@ import com.hurence.logisland.processor.AbstractProcessor; import com.hurence.logisland.processor.ProcessContext; import com.hurence.logisland.processor.ProcessError; -import com.hurence.logisland.processor.webAnalytics.modele.AllQueryParameterRemover; -import com.hurence.logisland.processor.webAnalytics.modele.KeepSomeQueryParameterRemover; -import com.hurence.logisland.processor.webAnalytics.modele.QueryParameterRemover; -import com.hurence.logisland.processor.webAnalytics.modele.RemoveSomeQueryParameterRemover; +import com.hurence.logisland.processor.webanalytics.modele.AllQueryParameterRemover; +import com.hurence.logisland.processor.webanalytics.modele.KeepSomeQueryParameterRemover; +import com.hurence.logisland.processor.webanalytics.modele.QueryParameterRemover; +import com.hurence.logisland.processor.webanalytics.modele.RemoveSomeQueryParameterRemover; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.validator.StandardValidators; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLDecoder.java similarity index 97% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLDecoder.java index a7fd16d7c..fd1827fe5 100644 
--- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/URLDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLDecoder.java @@ -13,9 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; -import com.hurence.logisland.annotation.behavior.DynamicProperty; import com.hurence.logisland.annotation.documentation.CapabilityDescription; import com.hurence.logisland.annotation.documentation.ExtraDetailFile; import com.hurence.logisland.annotation.documentation.Tags; @@ -27,8 +26,6 @@ import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.validator.StandardValidators; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.UnsupportedEncodingException; import java.util.*; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java similarity index 97% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java index 9c5d2e024..66f01c055 100644 --- 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AbstractQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java @@ -1,4 +1,4 @@ -package com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; import java.util.Arrays; import java.util.LinkedHashMap; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java similarity index 84% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java index 8864f1cd4..ff93114fb 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/AllQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java @@ -1,4 +1,4 @@ -package com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; public class AllQueryParameterRemover implements QueryParameterRemover { diff --git 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java similarity index 93% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java index 3c233f68b..da1f96f0c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java @@ -1,4 +1,4 @@ -package com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; import java.util.List; import java.util.Map; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java similarity index 59% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java rename to 
logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java index 27453df6a..1ea152028 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/QueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java @@ -1,4 +1,4 @@ -package com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; public interface QueryParameterRemover { String removeQueryParameters(String url); diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java similarity index 93% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java index 422b39a5a..c13a110e5 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java @@ -1,4 +1,4 @@ -package 
com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; import java.util.List; import java.util.Map; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java similarity index 97% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java index 42dcc40fb..ca2296645 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webAnalytics/modele/SplittedURI.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java @@ -1,4 +1,4 @@ -package com.hurence.logisland.processor.webAnalytics.modele; +package com.hurence.logisland.processor.webanalytics.modele; public class SplittedURI { private String beforeQuery; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSessionTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/ConsolidateSessionTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSessionTest.java rename to 
logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/ConsolidateSessionTest.java index bfdf66f94..a65d1cc4a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/ConsolidateSessionTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/ConsolidateSessionTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSessionTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSessionTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSessionTest.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSessionTest.java index d13c1a66c..ffc7de7c1 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/IncrementalWebSessionTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/IncrementalWebSessionTest.java @@ -13,7 +13,7 @@ * See the License for 
the specific language governing permissions and * limitations under the License. */ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.component.InitializationException; import com.hurence.logisland.component.PropertyDescriptor; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockCacheService.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockCacheService.java similarity index 96% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockCacheService.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockCacheService.java index f4f7b26b7..879598586 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockCacheService.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockCacheService.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.component.PropertyDescriptor; import com.hurence.logisland.controller.AbstractControllerService; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockElasticsearchClientService.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockElasticsearchClientService.java similarity index 98% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockElasticsearchClientService.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockElasticsearchClientService.java index d4229af09..f8eed8a1c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/MockElasticsearchClientService.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/MockElasticsearchClientService.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.component.PropertyDescriptor; import com.hurence.logisland.controller.AbstractControllerService; @@ -23,7 +23,6 @@ import com.hurence.logisland.service.datastore.MultiGetQueryRecord; import com.hurence.logisland.service.datastore.MultiGetResponseRecord; -import java.io.IOException; import java.util.*; public class MockElasticsearchClientService extends AbstractControllerService implements ElasticsearchClientService { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java index 4dca81084..b9c92b029 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URIDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.processor.Processor; import com.hurence.logisland.record.FieldType; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java index 6ec0db89a..2264988ad 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; -import com.hurence.logisland.processor.webAnalytics.modele.SplittedURI; +import com.hurence.logisland.processor.webanalytics.modele.SplittedURI; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.record.StandardRecord; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java index 08e839c94..258afd5ca 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/URLDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.processor.Processor; import com.hurence.logisland.record.FieldType; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/setSourceOfTrafficTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/setSourceOfTrafficTest.java similarity index 99% rename from logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/setSourceOfTrafficTest.java rename to logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/setSourceOfTrafficTest.java index 5aa176deb..93f191a40 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webAnalytics/setSourceOfTrafficTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/setSourceOfTrafficTest.java @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.hurence.logisland.processor.webAnalytics; +package com.hurence.logisland.processor.webanalytics; import com.hurence.logisland.component.InitializationException; import com.hurence.logisland.record.Field; @@ -27,7 +27,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic.ES_INDEX_FIELD; +import static com.hurence.logisland.processor.webanalytics.SetSourceOfTraffic.ES_INDEX_FIELD; public class setSourceOfTrafficTest { From 125fcb20f1ffa543a9badeca2b8acaa0afb81201 Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 18:26:24 +0100 Subject: [PATCH 13/17] Fixed most problems told by codacy. --- .../logisland/processor/webanalytics/URIDecoder.java | 2 -- .../modele/AbstractQueryParameterRemover.java | 4 ++-- .../modele/KeepSomeQueryParameterRemover.java | 2 +- .../modele/RemoveSomeQueryParameterRemover.java | 2 +- .../processor/webanalytics/URIDecoderTest.java | 4 ---- .../processor/webanalytics/URLCleanerTest.java | 10 +--------- .../processor/webanalytics/URLDecoderTest.java | 2 -- .../logisland/validator/StandardValidators.java | 2 +- 8 files changed, 6 insertions(+), 22 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java index 19f04a344..2c8d8c5b9 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java @@ -36,8 +36,6 @@ @ExtraDetailFile("./details/URLDecoder-Detail.rst") public class URIDecoder extends 
AbstractProcessor { - private final static String UTF8_CHARSET = "UTF-8"; - private static final PropertyDescriptor FIELDS_TO_DECODE_PROP = new PropertyDescriptor.Builder() .name("decode.fields") .description("List of fields (URL) to decode") diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java index 66f01c055..cf32935e2 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java @@ -7,8 +7,8 @@ import java.util.stream.Collectors; public abstract class AbstractQueryParameterRemover implements QueryParameterRemover { - final char keyValueSeparator; - final char parameterSeparator; + private final char keyValueSeparator; + private final char parameterSeparator; public AbstractQueryParameterRemover(char keyValueSeparator, char parameterSeparator) { this.keyValueSeparator = keyValueSeparator; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java index da1f96f0c..6f26737b9 100644 --- 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java @@ -7,7 +7,7 @@ public class KeepSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { - final Set parameterToKeep; + private final Set parameterToKeep; public KeepSomeQueryParameterRemover(Set parameterToKeep, char keyValueSeparator, char parameterSeparator) { super(keyValueSeparator, parameterSeparator); diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java index c13a110e5..f3b001bad 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java @@ -7,7 +7,7 @@ public class RemoveSomeQueryParameterRemover extends AbstractQueryParameterRemover implements QueryParameterRemover { - final Set parameterToRemove; + private final Set parameterToRemove; public RemoveSomeQueryParameterRemover(Set parameterToRemove, char keyValueSeparator, char parameterSeparator) { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java index b9c92b029..7d360a3b8 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java @@ -33,8 +33,6 @@ public Processor getDecoder() { return new URIDecoder(); } - private static final Logger logger = LoggerFactory.getLogger(URIDecoderTest.class); - private static final String urlVal1 = "https://www.test.com/de/search/?text=toto"; private static final String expectedDecodedUrlVal1 = urlVal1; @@ -48,10 +46,8 @@ public Processor getDecoder() { private static final String expectedDecodedVal4 = "key1+ key2 +(key3-key4)"; private static final String val5 = "%co"; - private static final String expectedDecodedVal5 = "%co"; private static final String val6 = "%%"; - private static final String expectedDecodedVal6 = "%%"; private static final String value1 = "value1"; private static final String value2 = "value2"; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java index 2264988ad..788504d6a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java @@ -34,8 +34,6 @@ public class URLCleanerTest { - private static final 
Logger logger = LoggerFactory.getLogger(URLCleanerTest.class); - private static final String url1 = "https://www.test.com/de/search/?text=toto"; private static final String expectedUrl1WithoutParams = "https://www.test.com/de/search/"; private static final String expectedUrl1KeepText = url1; @@ -43,13 +41,11 @@ public class URLCleanerTest { private static final String url2 = "https://www.t%888est%20.com/de/search/?text=calendrier%20%20%202019"; private static final String expectedUrl2WithoutParams = "https://www.t%888est%20.com/de/search/"; -// private static final String expectedUrl2KeepText = "https://www.t%888est%20.com/de/search/?text=calendrier+++2019"; private static final String expectedUrl2KeepText = url2; private static final String expectedUrl2RemoveText = expectedUrl2WithoutParams; private static final String url3 = "https://www.test.com/en/search/?text=key1+%20key2%20+%28key3-key4%29"; private static final String expectedUrl3WithoutParams = "https://www.test.com/en/search/"; -// private static final String expectedUrl3KeepText = "https://www.test.com/en/search/?text=key1++key2++%28key3-key4%29"; private static final String expectedUrl3KeepText = url3; private static final String expectedUrl3RemoveText = expectedUrl3WithoutParams; @@ -74,12 +70,8 @@ public class URLCleanerTest { private static final String expectedVal1WithoutParams = "key1+%20key2%20+%28key3-key4%29"; private static final String val2 = "%co"; - private static final String expectedVal5 = "%co"; - private static final String expectedVal2WithoutParams = "%co"; - private static final String val3 = "%%"; - private static final String expectedVal6 = "%%"; - private static final String expectedVal3WithoutParams = "%%"; + @Test public void testUriBuilder() throws URISyntaxException { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java 
b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java index 258afd5ca..f771cd216 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java @@ -32,8 +32,6 @@ public Processor getDecoder() { return new URLDecoder(); } - private static final Logger logger = LoggerFactory.getLogger(URLDecoderTest.class); - private static final String urlVal1 = "https://www.test.com/de/search/?text=toto"; private static final String expectedDecodedUrlVal1 = urlVal1; diff --git a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java index 7d96c1bd5..0f2144caa 100644 --- a/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java +++ b/logisland-core/logisland-api/src/main/java/com/hurence/logisland/validator/StandardValidators.java @@ -127,7 +127,7 @@ public ValidationResult validate(final String subject, final String value) { if (value.length() != 1) { reason = "Not a valid character !"; } else { - char character = value.charAt(0); + value.charAt(0); } } } catch (final NumberFormatException e) { From 09c297097120021beb776d622c067ae05a7595ed Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Thu, 3 Dec 2020 18:29:59 +0100 Subject: [PATCH 14/17] Added apache license everyWhere. 
--- .../processor/webanalytics/URIDecoder.java | 2 +- .../processor/webanalytics/URLCleaner.java | 2 +- .../modele/AbstractQueryParameterRemover.java | 15 +++++++++++++++ .../modele/AllQueryParameterRemover.java | 15 +++++++++++++++ .../modele/KeepSomeQueryParameterRemover.java | 15 +++++++++++++++ .../modele/QueryParameterRemover.java | 15 +++++++++++++++ .../modele/RemoveSomeQueryParameterRemover.java | 15 +++++++++++++++ .../webanalytics/modele/SplittedURI.java | 15 +++++++++++++++ .../processor/webanalytics/URIDecoderTest.java | 4 +--- .../processor/webanalytics/URLCleanerTest.java | 4 +--- .../processor/webanalytics/URLDecoderTest.java | 2 -- 11 files changed, 94 insertions(+), 10 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java index 2c8d8c5b9..73db79e72 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URIDecoder.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2016 Hurence (support@hurence.com) + * Copyright (C) 2020 Hurence (support@hurence.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java index 01b8fde55..61ce9f295 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2016 Hurence (support@hurence.com) + * Copyright (C) 2020 Hurence (support@hurence.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java index cf32935e2..b579fea3c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.hurence.logisland.processor.webanalytics.modele; import java.util.Arrays; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java index ff93114fb..af6c7777a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AllQueryParameterRemover.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.hurence.logisland.processor.webanalytics.modele; public class AllQueryParameterRemover implements QueryParameterRemover { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java index 6f26737b9..a21f7ea30 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/KeepSomeQueryParameterRemover.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.hurence.logisland.processor.webanalytics.modele; import java.util.List; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java index 1ea152028..b48ea9f05 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/QueryParameterRemover.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.hurence.logisland.processor.webanalytics.modele; public interface QueryParameterRemover { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java index f3b001bad..210d86d09 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/RemoveSomeQueryParameterRemover.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.hurence.logisland.processor.webanalytics.modele; import java.util.List; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java index ca2296645..310981845 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/SplittedURI.java @@ -1,3 +1,18 @@ +/** + * Copyright (C) 2020 Hurence (support@hurence.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.hurence.logisland.processor.webanalytics.modele; public class SplittedURI { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java index 7d360a3b8..9a309d0f8 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2016 Hurence (support@hurence.com) + * Copyright (C) 2020 Hurence (support@hurence.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,8 +24,6 @@ import com.hurence.logisland.util.runner.TestRunners; import org.junit.Assert; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class URIDecoderTest { diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java index 788504d6a..10051378c 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java @@ -1,5 +1,5 @@ /** - * Copyright (C) 2016 Hurence (support@hurence.com) + * Copyright (C) 2020 Hurence (support@hurence.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,8 +24,6 @@ import com.hurence.logisland.util.runner.TestRunners; import org.junit.Assert; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.UnsupportedEncodingException; import java.net.URISyntaxException; diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java index f771cd216..510c16ec3 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLDecoderTest.java @@ -23,8 +23,6 @@ import com.hurence.logisland.util.runner.TestRunner; import com.hurence.logisland.util.runner.TestRunners; import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class URLDecoderTest { From 4c295a984b8811f9d05d1223b45e79273ae8556e Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Fri, 4 Dec 2020 09:40:25 +0100 Subject: [PATCH 15/17] Fixed codacy remarks except one that is wrong. 
--- .../processor/webanalytics/SetSourceOfTraffic.java | 4 ++-- .../logisland/processor/webanalytics/URIDecoderTest.java | 7 +++---- .../logisland/processor/webanalytics/URLCleanerTest.java | 7 +------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java index faabd5981..7a475424f 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/SetSourceOfTraffic.java @@ -25,17 +25,17 @@ import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; import com.hurence.logisland.service.cache.CacheService; -import com.hurence.logisland.service.elasticsearch.ElasticsearchClientService; import com.hurence.logisland.service.datastore.InvalidMultiGetQueryRecordException; import com.hurence.logisland.service.datastore.MultiGetQueryRecord; import com.hurence.logisland.service.datastore.MultiGetQueryRecordBuilder; import com.hurence.logisland.service.datastore.MultiGetResponseRecord; +import com.hurence.logisland.service.elasticsearch.ElasticsearchClientService; import com.hurence.logisland.validator.StandardValidators; import org.apache.commons.collections.map.HashedMap; import java.io.UnsupportedEncodingException; -import java.net.*; import java.net.URLDecoder; +import java.net.*; import java.util.*; import java.util.regex.Pattern; diff --git 
a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java index 9a309d0f8..d72853585 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URIDecoderTest.java @@ -27,10 +27,6 @@ public class URIDecoderTest { - public Processor getDecoder() { - return new URIDecoder(); - } - private static final String urlVal1 = "https://www.test.com/de/search/?text=toto"; private static final String expectedDecodedUrlVal1 = urlVal1; @@ -50,6 +46,9 @@ public Processor getDecoder() { private static final String value1 = "value1"; private static final String value2 = "value2"; + public Processor getDecoder() { + return new URIDecoder(); + } private Record getRecord1() { Record record1 = new StandardRecord(); diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java index 10051378c..98a1aad2a 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java @@ -72,33 +72,28 @@ public class URLCleanerTest { @Test - public void testUriBuilder() throws 
URISyntaxException { + public void testSplittedUri() { SplittedURI url = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&page=2&sort=relevance"); -// private static final String url4 = "https://www.orexad.com/fr/search?q=sauterelle%7C%7Crelevance&page=2&sort=relevance"; Assert.assertEquals("https://www.orexad.com/fr/search?", url.getBeforeQuery()); Assert.assertEquals("q=sauterelle%7C%7Crelevance&page=2&sort=relevance", url.getQuery()); Assert.assertEquals("", url.getAfterQuery()); SplittedURI urlDecoded = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"); -// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; Assert.assertEquals("https://www.orexad.com/fr/search?", urlDecoded.getBeforeQuery()); Assert.assertEquals("q=sauterelle||relevance&page=2&sort=relevance", urlDecoded.getQuery()); Assert.assertEquals("", urlDecoded.getAfterQuery()); SplittedURI urlWithHashTag = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance#myTitle"); -// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; Assert.assertEquals("https://www.orexad.com/fr/search?", urlWithHashTag.getBeforeQuery()); Assert.assertEquals("q=sauterelle||relevance&page=2&sort=relevance", urlWithHashTag.getQuery()); Assert.assertEquals("#myTitle", urlWithHashTag.getAfterQuery()); SplittedURI simpleUrl = SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/"); -// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; Assert.assertEquals("https://www.orexad.com/fr/", simpleUrl.getBeforeQuery()); Assert.assertEquals("", simpleUrl.getQuery()); Assert.assertEquals("", simpleUrl.getAfterQuery()); SplittedURI simpleUrlWithFragment = 
SplittedURI.fromMalFormedURI("https://www.orexad.com/fr/#gggg"); -// private static final String url4Decoded = "https://www.orexad.com/fr/search?q=sauterelle||relevance&page=2&sort=relevance"; Assert.assertEquals("https://www.orexad.com/fr/", simpleUrlWithFragment.getBeforeQuery()); Assert.assertEquals("", simpleUrlWithFragment.getQuery()); Assert.assertEquals("#gggg", simpleUrlWithFragment.getAfterQuery()); From 3096256df1bd6343b641cd1ae0d4b41a2ee6abce Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Fri, 4 Dec 2020 12:09:41 +0100 Subject: [PATCH 16/17] Added around 30 new urls as tests. Fixed a minor bugs, any query like 'a=&c' was transformed into 'a&c' therefore was not keeping the '=' was here. --- .../modele/AbstractQueryParameterRemover.java | 27 +- .../webanalytics/URLCleanerTest.java | 468 +++++++++++++++++- 2 files changed, 485 insertions(+), 10 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java index b579fea3c..9746b1c06 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/modele/AbstractQueryParameterRemover.java @@ -34,23 +34,32 @@ public String removeQueryParameters(String urlStr) { SplittedURI guessSplittedURI = SplittedURI.fromMalFormedURI(urlStr); if (guessSplittedURI.getQuery().isEmpty()) return urlStr; Map paramsNameValue = Arrays.stream(guessSplittedURI.getQuery().split(String.valueOf(parameterSeparator))) - .map(queryString -> 
queryString.split(String.valueOf(keyValueSeparator))) - .collect(Collectors.toMap( - keyValueArr -> keyValueArr[0], - keyValueArr -> { + .map(queryString -> { + String[] split = queryString.split(String.valueOf(keyValueSeparator)); + if (split.length==1 && queryString.contains(String.valueOf(keyValueSeparator))) { + return new String[]{split[0], ""}; + } else { + return split; + } + }) + .collect(LinkedHashMap::new, + (map, keyValueArr) -> { String[] values = Arrays.copyOfRange(keyValueArr, 1, keyValueArr.length); - return String.join("", values); + if (values.length == 0) { + map.put(keyValueArr[0], null); + } else { + map.put(keyValueArr[0], String.join("", values)); + } }, - (x, y) -> y, - LinkedHashMap::new - )); + LinkedHashMap::putAll); + List> paramsNameValueFiltred = filterParams(paramsNameValue); if (paramsNameValueFiltred.isEmpty()) { return guessSplittedURI.getBeforeQueryWithoutQuestionMark() + guessSplittedURI.getAfterQuery(); } else { String newQueryString = paramsNameValueFiltred.stream() .map(entry -> { - if (entry.getValue().isEmpty()) { + if (entry.getValue() == null) { return entry.getKey(); } else { return entry.getKey() + keyValueSeparator + entry.getValue(); diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java index 98a1aad2a..cf73113f1 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/test/java/com/hurence/logisland/processor/webanalytics/URLCleanerTest.java @@ -15,6 +15,7 @@ */ package com.hurence.logisland.processor.webanalytics; +import 
com.hurence.logisland.component.PropertyDescriptor; import com.hurence.logisland.processor.webanalytics.modele.SplittedURI; import com.hurence.logisland.record.FieldType; import com.hurence.logisland.record.Record; @@ -26,9 +27,14 @@ import org.junit.Test; import java.io.UnsupportedEncodingException; -import java.net.URISyntaxException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; public class URLCleanerTest { @@ -642,4 +648,464 @@ public void testJustKeyInQuery2() { out.assertRecordSizeEquals(record.size()); out.assertFieldEquals("url","http://host.com/path?mysyntax&size=2#anchor"); } + + @Test + public void testQuerywithoutValueAndEmptyValue() { + Record record = new StandardRecord(); + record.setField("url", FieldType.STRING, "http://host.com/path?a=b&c=&d&z=w"); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, "url"); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + testRunner.setProperty(URLCleaner.REMOVE_PARAMS, "pretty"); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + out.assertFieldEquals("url","http://host.com/path?a=b&c=&d&z=w"); + } + + @Test + public void bulkRemoveAllTest() { + Map inputUrlToExpectedUrl = new HashMap(); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=loupe+las33300", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386?q=elingue%7C%7Crelevance%7C%7CmanufacturerNameFacet%7C%7CGISS&text=elingue&classif=", + "https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/protection-usage-court/c-40-10-21?q=%7C%7Crelevance&page=6", + "https://mydomain.com/fr/protection-usage-court/c-40-10-21" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763", + "https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125%20mm||&classif=45&sortAttribute=&sortOrder=", + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200" + ); +// decoded -> https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125 mm||&classif=45&sortAttribute=&sortOrder= + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=Chaine+9.25+inox", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013?t=cadenas+pompier", + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/facom/c-1111?q=%7C%7C%7C%7CcategoryLevel1%7C%7C45&page=5", + "https://mydomain.com/fr/facom/c-1111" + ); + inputUrlToExpectedUrl.put( + "https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878?q=||relevance||attribute891||43&sortAttribute=attribute891,attribute109,attribute5041,attribute157&sortOrder=asc", + "https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/cart#", + "https://mydomain.com/fr/cart#" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/fr/search/?text=smc+aq+240f-06-00", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( + "https://es.world.com/es/herramientas-de-mano/c-35-10?q=||relevance||manufacturerNameFacet||Roebuck||categoryLevel3%7C%7C35-10-20", + "https://es.world.com/es/herramientas-de-mano/c-35-10" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/search/?text=piquage%20&q=piquage%20||relevance||manufacturerNameFacet||Parker%20Legris||attribute1875%7C%7CM5%20%27%27", + "https://mydomain.com/search/" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx?abstractID=9GrIawodOyWmVAXM%2b9Bq3eJFWUiAKhB2Toh3Oct0zIH%2fCbISTIls4l4Ox45ROTAWHCUzXjOonos%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video", + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang", + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem", + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide", + "https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/apex/ESCUserProfileInfo", + "https://my--domain.force.org/apex/ESCUserProfileInfo" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance?hit=home&urlorig=/vgn-ext-templating/#tbl07", + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance#tbl07" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyProfile?utm_medium=Email&utm_source=&utm_campaign=ESC+-+ESC+Congress+2020+-+registration+confirmation#", + "https://my--domain.force.org/ESCMyProfile#" + ); + inputUrlToExpectedUrl.put( + "http://spo.hurence.org/default.aspx?eevtid=1482&showResults=False&_ga=2.267591435.481963044.1578999296-479375258.1578999295", + "http://spo.hurence.org/default.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration?utm_medium=Email&utm_source=Councils&utm_campaign=Councils+-+FCVB+2020+-+Early+registration+fee+-++Last+call", + "https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx?abstractID=9GrIawodOyXZLPpXgJHtvCxG5gTt5TznJt97rA1Jy%2bzH7V5eLZVqUnyoo903fiw9nf7mbxKuI14%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions", + "https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions" + ); + inputUrlToExpectedUrl.put( + "https://idp.hurence.org/idp/login.jsp?loginFailed=true&actionUrl=%2Fidp%2FAuthn%2FESCUserPassword", + "https://idp.hurence.org/idp/login.jsp" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/2016/formulaResult.aspx?model=europelow&exam=&patient=370532", + "https://aa.net/2016/formulaResult.aspx" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/login.aspx?ReturnUrl=%2fpercutaneous-interventions%2fhomepage.aspx", + "https://aa.net/login.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction", + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyPublications", + "https://my--domain.force.org/ESCMyPublications" + ); + runTestWithRemoveAll(inputUrlToExpectedUrl); + } + + @Test + public void bulkRemoveQTest() { + Map inputUrlToExpectedUrl = new HashMap(); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=loupe+las33300", + "https://mydomain.com/fr/search/?text=loupe+las33300" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386?q=elingue%7C%7Crelevance%7C%7CmanufacturerNameFacet%7C%7CGISS&text=elingue&classif=", + "https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386?text=elingue&classif=" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/protection-usage-court/c-40-10-21?q=%7C%7Crelevance&page=6", + "https://mydomain.com/fr/protection-usage-court/c-40-10-21?page=6" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763", + "https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125%20mm||&classif=45&sortAttribute=&sortOrder=", + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?classif=45&sortAttribute=&sortOrder=" + ); +// decoded -> https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125 mm||&classif=45&sortAttribute=&sortOrder= + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=Chaine+9.25+inox", + "https://mydomain.com/fr/search/?text=Chaine+9.25+inox" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013?t=cadenas+pompier", + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013?t=cadenas+pompier" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/facom/c-1111?q=%7C%7C%7C%7CcategoryLevel1%7C%7C45&page=5", + "https://mydomain.com/fr/facom/c-1111?page=5" + ); + inputUrlToExpectedUrl.put( + "https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878?q=||relevance||attribute891||43&sortAttribute=attribute891,attribute109,attribute5041,attribute157&sortOrder=asc", + "https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878?sortAttribute=attribute891,attribute109,attribute5041,attribute157&sortOrder=asc" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/cart#", + "https://mydomain.com/fr/cart#" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=smc+aq+240f-06-00", + "https://mydomain.com/fr/search/?text=smc+aq+240f-06-00" + ); + inputUrlToExpectedUrl.put( + "https://es.world.com/es/herramientas-de-mano/c-35-10?q=||relevance||manufacturerNameFacet||Roebuck||categoryLevel3%7C%7C35-10-20", + 
"https://es.world.com/es/herramientas-de-mano/c-35-10" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/search/?text=piquage%20&q=piquage%20||relevance||manufacturerNameFacet||Parker%20Legris||attribute1875%7C%7CM5%20%27%27", + "https://mydomain.com/search/?text=piquage%20" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx?abstractID=9GrIawodOyWmVAXM%2b9Bq3eJFWUiAKhB2Toh3Oct0zIH%2fCbISTIls4l4Ox45ROTAWHCUzXjOonos%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx?abstractID=9GrIawodOyWmVAXM%2b9Bq3eJFWUiAKhB2Toh3Oct0zIH%2fCbISTIls4l4Ox45ROTAWHCUzXjOonos%3d" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video", + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang", + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem", + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem" + ); + inputUrlToExpectedUrl.put( + 
"https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide", + "https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/apex/ESCUserProfileInfo", + "https://my--domain.force.org/apex/ESCUserProfileInfo" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance?hit=home&urlorig=/vgn-ext-templating/#tbl07", + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance?hit=home&urlorig=/vgn-ext-templating/#tbl07" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyProfile?utm_medium=Email&utm_source=&utm_campaign=ESC+-+ESC+Congress+2020+-+registration+confirmation#", + "https://my--domain.force.org/ESCMyProfile?utm_medium=Email&utm_source=&utm_campaign=ESC+-+ESC+Congress+2020+-+registration+confirmation#" + ); + inputUrlToExpectedUrl.put( + "http://spo.hurence.org/default.aspx?eevtid=1482&showResults=False&_ga=2.267591435.481963044.1578999296-479375258.1578999295", + "http://spo.hurence.org/default.aspx?eevtid=1482&showResults=False&_ga=2.267591435.481963044.1578999296-479375258.1578999295" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration?utm_medium=Email&utm_source=Councils&utm_campaign=Councils+-+FCVB+2020+-+Early+registration+fee+-++Last+call", + "https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration?utm_medium=Email&utm_source=Councils&utm_campaign=Councils+-+FCVB+2020+-+Early+registration+fee+-++Last+call" + ); 
+ inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx?abstractID=9GrIawodOyXZLPpXgJHtvCxG5gTt5TznJt97rA1Jy%2bzH7V5eLZVqUnyoo903fiw9nf7mbxKuI14%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx?abstractID=9GrIawodOyXZLPpXgJHtvCxG5gTt5TznJt97rA1Jy%2bzH7V5eLZVqUnyoo903fiw9nf7mbxKuI14%3d" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions", + "https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions" + ); + inputUrlToExpectedUrl.put( + "https://idp.hurence.org/idp/login.jsp?loginFailed=true&actionUrl=%2Fidp%2FAuthn%2FESCUserPassword", + "https://idp.hurence.org/idp/login.jsp?loginFailed=true&actionUrl=%2Fidp%2FAuthn%2FESCUserPassword" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/2016/formulaResult.aspx?model=europelow&exam=&patient=370532", + "https://aa.net/2016/formulaResult.aspx?model=europelow&exam=&patient=370532" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/login.aspx?ReturnUrl=%2fpercutaneous-interventions%2fhomepage.aspx", + "https://aa.net/login.aspx?ReturnUrl=%2fpercutaneous-interventions%2fhomepage.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction", + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyPublications", + "https://my--domain.force.org/ESCMyPublications" + ); + runTestWithRemoveQ(inputUrlToExpectedUrl); + } + + + /** Bulk check with KEEP_PARAMS set to "q" (via runTestWithKeepQ): each input URL is mapped to its expected cleaned form; in the expectations only the q query parameter survives, while the path and any #fragment are left as they were. */ @Test + public void bulkKeepQTest() { + Map inputUrlToExpectedUrl = new HashMap(); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=loupe+las33300", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( +
"https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386?q=elingue%7C%7Crelevance%7C%7CmanufacturerNameFacet%7C%7CGISS&text=elingue&classif=", + "https://mydomain.com/fr/elingue-ronde-haute-resistance/r-PR_G1408003386?q=elingue%7C%7Crelevance%7C%7CmanufacturerNameFacet%7C%7CGISS" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/protection-usage-court/c-40-10-21?q=%7C%7Crelevance&page=6", + "https://mydomain.com/fr/protection-usage-court/c-40-10-21?q=%7C%7Crelevance" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763", + "https://mydomain.com/fr/ensemble-ebavurage-chanfreinage-en-coffret/p-G1111003763" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125%20mm||&classif=45&sortAttribute=&sortOrder=", + "https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125%20mm||" + ); +// decoded -> https://mydomain.com/fr/bac-a-bec/r-PR_G1408000200?q=||||attribute157||Gris||attribute228||225x150x125 mm||&classif=45&sortAttribute=&sortOrder= + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=Chaine+9.25+inox", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013?t=cadenas+pompier", + "https://mydomain.com/fr/moraillon-porte-cadenas/p-G1164000013" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/facom/c-1111?q=%7C%7C%7C%7CcategoryLevel1%7C%7C45&page=5", + "https://mydomain.com/fr/facom/c-1111?q=%7C%7C%7C%7CcategoryLevel1%7C%7C45" + ); + inputUrlToExpectedUrl.put( + "https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878?q=||relevance||attribute891||43&sortAttribute=attribute891,attribute109,attribute5041,attribute157&sortOrder=asc", + 
"https://www.btshop.nl/nl/veiligheidsschoen-sl-80-blue-esd-s2/r-PR_G1021004878?q=||relevance||attribute891||43" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/cart#", + "https://mydomain.com/fr/cart#" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/fr/search/?text=smc+aq+240f-06-00", + "https://mydomain.com/fr/search/" + ); + inputUrlToExpectedUrl.put( + "https://es.world.com/es/herramientas-de-mano/c-35-10?q=||relevance||manufacturerNameFacet||Roebuck||categoryLevel3%7C%7C35-10-20", + "https://es.world.com/es/herramientas-de-mano/c-35-10?q=||relevance||manufacturerNameFacet||Roebuck||categoryLevel3%7C%7C35-10-20" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/search/?text=piquage%20&q=piquage%20||relevance||manufacturerNameFacet||Parker%20Legris||attribute1875%7C%7CM5%20%27%27", + "https://mydomain.com/search/?q=piquage%20||relevance||manufacturerNameFacet||Parker%20Legris||attribute1875%7C%7CM5%20%27%27" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx?abstractID=9GrIawodOyWmVAXM%2b9Bq3eJFWUiAKhB2Toh3Oct0zIH%2fCbISTIls4l4Ox45ROTAWHCUzXjOonos%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractContent.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video", + "https://mydomain.com/Congress/ESC-CONGRESS-2019/Expert-Advice-Tips-and-tricks-in-imaging-your-patient-with-valvular-heart-d/189866-tips-and-tricks-for-imaging-in-aortic-stenosis#video" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang", + "https://mydomain.com/Councils/Council-on-Hypertension-(CHT)/News/position-statement-of-the-world-council-on-hypertension-on-ace-inhibitors-and-ang" + ); + 
inputUrlToExpectedUrl.put( + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem", + "https://mydomain.com/Journals/E-Journal-of-Cardiology-Practice/Volume-14/Treatment-of-right-heart-failure-is-there-a-solution-to-the-problem" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide", + "https://mydomain.com/Congress/EuroCMR-2019/Special-Course-8-how-to-read-CMR-for-extra-cardiac-findings/187189-how-to-interpret-common-organ-specific-findings-in-the-lungs-skeletal-system-liver-kidneys-breast-case-based-interactive-discussion#slide" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/apex/ESCUserProfileInfo", + "https://my--domain.force.org/apex/ESCUserProfileInfo" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance?hit=home&urlorig=/vgn-ext-templating/#tbl07", + "https://mydomain.com/Education/COVID-19-and-Cardiology/ESC-COVID-19-Guidance#tbl07" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyProfile?utm_medium=Email&utm_source=&utm_campaign=ESC+-+ESC+Congress+2020+-+registration+confirmation#", + "https://my--domain.force.org/ESCMyProfile#" + ); + inputUrlToExpectedUrl.put( + "http://spo.hurence.org/default.aspx?eevtid=1482&showResults=False&_ga=2.267591435.481963044.1578999296-479375258.1578999295", + "http://spo.hurence.org/default.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration?utm_medium=Email&utm_source=Councils&utm_campaign=Councils+-+FCVB+2020+-+Early+registration+fee+-++Last+call", +
"https://mydomain.com/Congresses-&-Events/Frontiers-in-Cardiovascular-Biomedicine/Registration" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx?abstractID=9GrIawodOyXZLPpXgJHtvCxG5gTt5TznJt97rA1Jy%2bzH7V5eLZVqUnyoo903fiw9nf7mbxKuI14%3d", + "https://mydomain.com/MyESC/modules/congress/Abstract/AbstractAuthors.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions", + "https://mydomain.com/Congresses-&-Events/ESC-Congress/Scientific-sessions" + ); + inputUrlToExpectedUrl.put( + "https://idp.hurence.org/idp/login.jsp?loginFailed=true&actionUrl=%2Fidp%2FAuthn%2FESCUserPassword", + "https://idp.hurence.org/idp/login.jsp" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/2016/formulaResult.aspx?model=europelow&exam=&patient=370532", + "https://aa.net/2016/formulaResult.aspx" + ); + inputUrlToExpectedUrl.put( + "https://aa.net/login.aspx?ReturnUrl=%2fpercutaneous-interventions%2fhomepage.aspx", + "https://aa.net/login.aspx" + ); + inputUrlToExpectedUrl.put( + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction", + "https://mydomain.com/Education/Practice-Tools/EACVI-toolboxes/3D-Echo/Atlas-of-Three%E2%80%93dimensional-Echocardiography/Volumes-and-Ejection-Fraction" + ); + inputUrlToExpectedUrl.put( + "https://my--domain.force.org/ESCMyPublications", + "https://my--domain.force.org/ESCMyPublications" + ); + runTestWithKeepQ(inputUrlToExpectedUrl); + } + /** Runs runTestWithConfig on the given input-to-expected URL map with URLCleaner.REMOVE_ALL_PARAMS set to "true". */ private void runTestWithRemoveAll(Map inputUrlToExpectedUrl) { + Map conf = new HashMap<>(); + conf.put(URLCleaner.REMOVE_ALL_PARAMS, "true"); + runTestWithConfig(inputUrlToExpectedUrl, conf); + } + + /** Runs runTestWithConfig on the given input-to-expected URL map with URLCleaner.REMOVE_PARAMS set to "q". */ private void runTestWithRemoveQ(Map inputUrlToExpectedUrl) { + Map conf = new HashMap<>(); + conf.put(URLCleaner.REMOVE_PARAMS, "q"); + runTestWithConfig(inputUrlToExpectedUrl, conf); + } + + /** Runs runTestWithConfig on the given input-to-expected URL map with URLCleaner.KEEP_PARAMS set to "q". */ private
void runTestWithKeepQ(Map inputUrlToExpectedUrl) { + Map conf = new HashMap<>(); + conf.put(URLCleaner.KEEP_PARAMS, "q"); + runTestWithConfig(inputUrlToExpectedUrl, conf); + } + + /** Shared driver: stores each input URL in its own field (url1..urlN) of a single record, runs URLCleaner over those fields with OVERWRITE_EXISTING plus the extra properties in conf, then asserts that exactly one record comes out, that it has the same size as the input record, and that every field equals the expected URL mapped from its original value. NOTE(review): Map and List appear here without type arguments, presumably dropped when this patch text was extracted; confirm against the repository. */ private void runTestWithConfig(Map inputUrlToExpectedUrl, Map conf) { + List fieldsNames = IntStream.range(1, inputUrlToExpectedUrl.size() + 1) + .mapToObj(i -> "url" + i) + .collect(Collectors.toList()); + List inputValues = new ArrayList<>(inputUrlToExpectedUrl.keySet()); + Record record = buildRecordFromMap(fieldsNames, inputValues); + /* snapshot taken before the run; used after it to look up each field's original input URL */ final Record myCopyOfInitialRecord = new StandardRecord(record); + + TestRunner testRunner = TestRunners.newTestRunner(new URLCleaner()); + testRunner.setProperty(URLCleaner.URL_FIELDS, String.join(",", fieldsNames)); + testRunner.setProperty(URLCleaner.CONFLICT_RESOLUTION_POLICY, URLCleaner.OVERWRITE_EXISTING); + conf.entrySet().forEach(kv -> { + testRunner.setProperty(kv.getKey(), kv.getValue()); + }); + testRunner.assertValid(); + testRunner.enqueue(record); + testRunner.run(); + testRunner.assertAllInputRecordsProcessed(); + testRunner.assertOutputRecordsCount(1); + + MockRecord out = testRunner.getOutputRecords().get(0); + out.assertRecordSizeEquals(record.size()); + + fieldsNames.forEach(fieldName -> { + String inputValue = myCopyOfInitialRecord.getField(fieldName).asString(); + String expectedValue = inputUrlToExpectedUrl.get(inputValue); + out.assertFieldEquals(fieldName, expectedValue); + }); + } + + /** Builds a StandardRecord holding one string field per index i, named fieldsNames.get(i) with value values.get(i); throws IllegalArgumentException when the two lists differ in size. */ private Record buildRecordFromMap(List fieldsNames, List values) { + if (fieldsNames.size() != values.size()) throw new IllegalArgumentException("list should be of same size"); + final Record record = new StandardRecord(); + IntStream + .range(0, fieldsNames.size()) + .forEach(i -> { + record.setStringField(fieldsNames.get(i), values.get(i)); + }); + return record; + } } From f9c679f61d10a2cfa2229ce2e6e62c945dd0f38f Mon Sep 17 00:00:00 2001 From: Gregoire Seguin-Henry Date: Fri, 4 Dec 2020 13:50:33 +0100 Subject: [PATCH 17/17] Made some proprities
of URLCleaner not required. --- .../hurence/logisland/processor/webanalytics/URLCleaner.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java index 61ce9f295..40e57f22e 100644 --- a/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java +++ b/logisland-components/logisland-processors/logisland-processor-web-analytics/src/main/java/com/hurence/logisland/processor/webanalytics/URLCleaner.java @@ -106,7 +106,7 @@ public class URLCleaner extends AbstractProcessor { public static final PropertyDescriptor PARAMETER_SEPARATOR = new PropertyDescriptor.Builder() .name("parameter.separator") .description("the character to use to separate the parameters in the query part of the uris") - .required(true) + .required(false) .defaultValue("&") .addValidator(StandardValidators.CHAR_VALIDATOR) .build(); @@ -114,7 +114,7 @@ public class URLCleaner extends AbstractProcessor { public static final PropertyDescriptor KEY_VALUE_SEPARATOR = new PropertyDescriptor.Builder() .name("key.value.separator") .description("the character to use to separate the parameter name from the parameter value in the query part of the uris") - .required(true) + .required(false) .defaultValue("=") .addValidator(StandardValidators.CHAR_VALIDATOR) .build();