-
Notifications
You must be signed in to change notification settings - Fork 0
/
labels_preprocessing.py
37 lines (28 loc) · 1.29 KB
/
labels_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def preprocess_labels(df, allowed_units=None):
    """Split ``entity_value`` into a number and a unit and keep valid rows.

    Each ``entity_value`` is expected to start with a number (optionally with
    a decimal part) followed by a unit, e.g. ``"10 gram"`` or
    ``"2.5 fluid ounce"``. Rows whose unit is not allowed for the row's
    ``entity_name`` are dropped.

    Args:
        df: DataFrame with ``entity_value`` and ``entity_name`` columns. It is
            not modified; the result is a new DataFrame.
        allowed_units: Optional mapping from entity name to a collection of
            allowed (lower-case) unit strings. When omitted, the module-level
            ``ALLOWED_UNITS`` mapping is used.

    Returns:
        A new DataFrame with the original columns plus ``value`` (float),
        ``unit`` (lower-cased unit text) and ``normalized_unit``, containing
        only the rows whose unit is allowed for their entity.

    Raises:
        KeyError: If ``entity_value`` or ``entity_name`` is missing from *df*.
    """
    # Imported locally because this file has no top-level import section.
    import re

    if allowed_units is None:
        # NOTE(review): ALLOWED_UNITS is not defined in this file; it has to
        # be provided by the environment that runs this code.
        allowed_units = ALLOWED_UNITS

    # A leading number followed by one or more whitespace-separated words.
    # Capturing every word (not just the first) lets multi-word units such as
    # "fluid ounce" be recognised.
    value_unit_pattern = re.compile(r'(\d+(?:\.\d+)?)\s*(\w+(?:\s+\w+)*)')

    def extract_value_and_unit(entity_value):
        """Return ``(number, unit_words)``; ``(None, ())`` when unparsable."""
        match = value_unit_pattern.match(str(entity_value))
        if match is None:
            return None, ()
        number, phrase = match.groups()
        return float(number), tuple(phrase.lower().split())

    def normalize_unit(unit_words, entity_name):
        """Return ``(unit, normalized_unit)`` for the parsed unit words."""
        if not unit_words:
            return None, None
        allowed = allowed_units.get(entity_name, [])
        # Try the first word alone before longer phrases, so every value that
        # was accepted with single-word parsing resolves exactly as before.
        for word_count in range(1, len(unit_words) + 1):
            candidate = ' '.join(unit_words[:word_count])
            if candidate in allowed:
                return candidate, candidate
        # Here you might want to add unit conversion logic
        # For example, converting 'gram' to 'kilogram' if needed
        return unit_words[0], None

    # Build the new columns in one pass; plain lists also work for an empty
    # frame, where unpacking ``zip(*...)`` into two names would raise.
    values, units, normalized_units = [], [], []
    for entity_value, entity_name in zip(df['entity_value'], df['entity_name']):
        value, unit_words = extract_value_and_unit(entity_value)
        unit, normalized_unit = normalize_unit(unit_words, entity_name)
        values.append(value)
        units.append(unit)
        normalized_units.append(normalized_unit)

    # ``assign`` returns a new frame, so the caller's DataFrame is untouched.
    result = df.assign(value=values, unit=units, normalized_unit=normalized_units)

    # Remove rows with invalid units
    return result.dropna(subset=['normalized_unit'])
# Imported here because the file has no top-level import section and the
# statements below need both names.
from pathlib import Path

import pandas as pd

# Locations of the raw training labels and of the cleaned output.
TRAIN_CSV_PATH = '/content/drive/MyDrive/aml_resource/dataset/train.csv'
OUTPUT_CSV_PATH = '/content/drive/MyDrive/preprocessed/train_labels.csv'

# Load the training data
train_df = pd.read_csv(TRAIN_CSV_PATH)

# Preprocess the labels
preprocessed_train_df = preprocess_labels(train_df)

# Save the preprocessed data. ``to_csv`` does not create missing parent
# directories, so make sure the output folder exists first.
Path(OUTPUT_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)
preprocessed_train_df.to_csv(OUTPUT_CSV_PATH, index=False)

print("Label preprocessing complete!")