Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/url cleanner #568

Merged
merged 17 commits into from
Dec 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,12 @@
<optional>true</optional>
</dependency>


<dependency>
<groupId>com.hurence.logisland</groupId>
<artifactId>logisland-processor-common</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hurence.logisland.processor.webAnalytics;
package com.hurence.logisland.processor.webanalytics;

import com.hurence.logisland.annotation.documentation.CapabilityDescription;
import com.hurence.logisland.annotation.documentation.ExtraDetailFile;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hurence.logisland.processor.webAnalytics;
package com.hurence.logisland.processor.webanalytics;

import com.hurence.logisland.annotation.documentation.*;
import com.hurence.logisland.classloading.PluginProxy;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hurence.logisland.processor.webAnalytics;
package com.hurence.logisland.processor.webanalytics;

import com.hurence.logisland.annotation.documentation.*;
import com.hurence.logisland.classloading.PluginProxy;
Expand All @@ -25,21 +25,21 @@
import com.hurence.logisland.record.FieldType;
import com.hurence.logisland.record.Record;
import com.hurence.logisland.service.cache.CacheService;
import com.hurence.logisland.service.elasticsearch.ElasticsearchClientService;
import com.hurence.logisland.service.datastore.InvalidMultiGetQueryRecordException;
import com.hurence.logisland.service.datastore.MultiGetQueryRecord;
import com.hurence.logisland.service.datastore.MultiGetQueryRecordBuilder;
import com.hurence.logisland.service.datastore.MultiGetResponseRecord;
import com.hurence.logisland.service.elasticsearch.ElasticsearchClientService;
import com.hurence.logisland.validator.StandardValidators;
import org.apache.commons.collections.map.HashedMap;

import java.io.UnsupportedEncodingException;
import java.net.*;
import java.net.URLDecoder;
import java.net.*;
import java.util.*;
import java.util.regex.Pattern;

import static com.hurence.logisland.processor.webAnalytics.SetSourceOfTraffic.*;
import static com.hurence.logisland.processor.webanalytics.SetSourceOfTraffic.*;

@Category(ComponentCategory.ANALYTICS)
@Tags({"session", "traffic", "source", "web", "analytics"})
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/**
* Copyright (C) 2020 Hurence (support@hurence.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hurence.logisland.processor.webanalytics;

import com.hurence.logisland.annotation.documentation.CapabilityDescription;
import com.hurence.logisland.annotation.documentation.ExtraDetailFile;
import com.hurence.logisland.annotation.documentation.Tags;
import com.hurence.logisland.component.InitializationException;
import com.hurence.logisland.component.PropertyDescriptor;
import com.hurence.logisland.processor.AbstractProcessor;
import com.hurence.logisland.processor.ProcessContext;
import com.hurence.logisland.processor.ProcessError;
import com.hurence.logisland.record.FieldType;
import com.hurence.logisland.record.Record;
import com.hurence.logisland.validator.StandardValidators;

import java.net.URI;
import java.util.*;

/**
 * Processor that rewrites configured record fields holding a URI with the decoded form of that URI.
 *
 * <p>The names of the fields to decode are read from the {@code decode.fields} property as a
 * comma separated list. For each configured field present on a record with a non-empty string
 * value, the value is parsed as a {@link URI} and rebuilt from its decoded components. The field
 * is only rewritten when the decoded form differs from the original value; its field type is kept.
 * When a value cannot be processed, the record is left untouched and an error is added to it.
 */
@Tags({"record", "fields", "Decode"})
@CapabilityDescription("Decode one or more field containing an URI with possibly special chars encoded\n" +
        "...")
@ExtraDetailFile("./details/URLDecoder-Detail.rst")
public class URIDecoder extends AbstractProcessor {

    private static final PropertyDescriptor FIELDS_TO_DECODE_PROP = new PropertyDescriptor.Builder()
            .name("decode.fields")
            .description("List of fields (URL) to decode")
            .required(true)
            .addValidator(StandardValidators.COMMA_SEPARATED_LIST_VALIDATOR)
            .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
            .build();

    /** Names of the record fields whose value must be decoded; filled by {@link #initFieldsToDecode}. */
    private Set<String> fieldsToDecode;

    /**
     * @return an unmodifiable list holding the single supported property, {@code decode.fields}
     */
    @Override
    public List<PropertyDescriptor> getSupportedPropertyDescriptors() {
        final List<PropertyDescriptor> descriptors = new ArrayList<>();
        descriptors.add(FIELDS_TO_DECODE_PROP);
        return Collections.unmodifiableList(descriptors);
    }

    /**
     * Runs the parent initialization, then loads the set of field names to decode from the context.
     *
     * @param context the processing context carrying the configured properties
     * @throws InitializationException declared by the overridden method
     */
    @Override
    public void init(ProcessContext context) throws InitializationException {
        super.init(context);
        initFieldsToDecode(context);
    }

    /**
     * Parses the comma separated {@code decode.fields} property into the set of field names.
     *
     * <p>Each name is trimmed, so whitespace around commas as well as at the very beginning or end
     * of the property value is ignored. Empty entries (for instance produced by a trailing comma)
     * are skipped because they can never match a field name.
     *
     * @param context the processing context carrying the configured properties
     */
    public void initFieldsToDecode(ProcessContext context) {
        final String commaSeparatedFields = context.getPropertyValue(FIELDS_TO_DECODE_PROP).asString();
        final Set<String> parsedFields = new HashSet<>();
        if (commaSeparatedFields != null) {
            for (String rawName : commaSeparatedFields.split(",")) {
                final String fieldName = rawName.trim();
                if (!fieldName.isEmpty()) {
                    parsedFields.add(fieldName);
                }
            }
        }
        fieldsToDecode = parsedFields;
    }

    /**
     * Decodes the configured fields of every record in place.
     *
     * @param context the processing context (not read here)
     * @param records the records to update
     * @return the same collection instance that was passed in, with its records updated in place
     */
    @Override
    public Collection<Record> process(ProcessContext context, Collection<Record> records) {
        for (Record record : records) {
            updateRecord(record);
        }
        return records;
    }

    /**
     * Decodes each configured field that exists on the record and holds a non-empty string.
     */
    private void updateRecord(Record record) {
        for (String fieldName : fieldsToDecode) {
            if (!record.hasField(fieldName)) {
                continue;
            }
            final String uriStr = record.getField(fieldName).asString();
            if (uriStr != null && !uriStr.isEmpty()) {
                decode(uriStr, record, fieldName);
            }
        }
    }

    /**
     * Replaces the given field with the decoded form of {@code uriStr} when the two differ.
     *
     * <p>Any failure (typically a value that is not a syntactically valid URI) is logged and
     * reported as an error on the record; the original field value is then kept.
     *
     * @param uriStr                   the raw field value to decode
     * @param record                   the record owning the field
     * @param fieldNameToSetDecodedUri the name of the field to overwrite with the decoded value
     */
    private void decode(String uriStr, Record record, String fieldNameToSetDecodedUri) {
        try {
            final String decodedURI = uriToDecodedString(new URI(uriStr));
            if (!decodedURI.equals(uriStr)) {
                // Read the type before removing the field so the replacement keeps it.
                final FieldType fieldType = record.getField(fieldNameToSetDecodedUri).getType();
                record.removeField(fieldNameToSetDecodedUri);
                record.setField(fieldNameToSetDecodedUri, fieldType, decodedURI);
            }
        } catch (Exception e) {
            // Best effort: one bad value must not interrupt the processing of the other fields/records.
            getLogger().error("Error while trying to decode uri {}, for record {}.", new Object[]{uriStr, record.getId()}, e);
            String msg = "Could not process uri : '" + uriStr + "'.\n Cause: " + e.getMessage();
            record.addError(ProcessError.STRING_FORMAT_ERROR.toString(), getLogger(), msg);
        }
    }

    /**
     * Rebuilds the string form of a URI from its decoded components.
     *
     * <p>A URI is like {@code [<scheme>:]<scheme-specific-part>[#<fragment>]}. Components that are
     * {@code null} or empty are left out of the result.
     *
     * @param uri the parsed URI
     * @return the concatenation of the scheme, the decoded scheme-specific part and the decoded fragment
     * @see URI
     */
    private String uriToDecodedString(URI uri) {
        final StringBuilder decoded = new StringBuilder();
        final String scheme = uri.getScheme();
        if (scheme != null && !scheme.isEmpty()) {
            decoded.append(scheme).append(':');
        }
        final String schemeSpecificPart = uri.getSchemeSpecificPart();
        if (schemeSpecificPart != null && !schemeSpecificPart.isEmpty()) {
            decoded.append(schemeSpecificPart);
        }
        final String fragment = uri.getFragment();
        if (fragment != null && !fragment.isEmpty()) {
            decoded.append('#').append(fragment);
        }
        return decoded.toString();
    }
}
Loading