# Preprocessing
#
# Builds a k-mer bag-of-words table from a tab-separated file of labelled
# sequences and plots the distribution of the class labels.
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
def generate_kmers(sequence, k):
    """
    Generates k-mers for a given sequence.

    Slides a window of width ``k`` one position at a time across
    ``sequence`` and collects every window, so consecutive k-mers overlap
    by ``k - 1`` characters.

    Args:
        sequence: Sliceable sequence (typically a string) to split.
        k: Window width; must be a positive integer.

    Returns:
        A list of the ``len(sequence) - k + 1`` overlapping slices in
        left-to-right order, or an empty list when ``sequence`` is shorter
        than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    if k < 1:
        # A zero width would yield only empty strings and a negative width
        # yields truncated slices of varying length; neither is a k-mer.
        raise ValueError("k must be a positive integer, got %r" % (k,))
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
def read_and_process_data(file_path, k):
    """
    Reads labelled sequences from a file and builds a k-mer count table.

    Each usable line holds exactly two tab-separated fields: a sequence and
    an integer class label. Blank lines, lines containing the text
    'sequence' (treated as a header), lines without exactly two fields and
    lines whose label is not an integer are skipped; the last two cases are
    reported on stdout.

    Args:
        file_path: Path of the text file to read.
        k: k-mer length passed to ``generate_kmers``.

    Returns:
        A pandas DataFrame with one row per accepted line, one integer
        column per distinct k-mer (occurrence counts, 0 when the k-mer is
        absent from that sequence) and a 'label' column, added last,
        holding the parsed labels.
    """
    sequences = []
    labels = []

    # Read everything up front so the file is closed before parsing starts.
    with open(file_path, 'r') as file:
        lines = file.readlines()
    print("Number of lines read:", len(lines))  # Debug print

    for line in lines:
        # Trim and check if the line is empty or looks like a header
        trimmed_line = line.strip()
        if not trimmed_line or 'sequence' in trimmed_line:
            continue  # Skip empty lines or header

        # Split the line into sequence and label parts
        parts = trimmed_line.split('\t')
        if len(parts) != 2:
            print("Skipped line (improper format):", trimmed_line)  # Debug print
            continue

        sequence, label_part = parts
        # Only the conversion can raise ValueError, so only it is guarded.
        try:
            label = int(label_part)
        except ValueError:
            print("Skipped line due to ValueError in label conversion:", trimmed_line)  # Debug print
            continue

        # Append both together so sequences and labels stay index-aligned.
        sequences.append(sequence)
        labels.append(label)

    # Generate k-mers for each sequence and count their occurrences
    kmer_dicts = [Counter(generate_kmers(seq, k)) for seq in sequences]

    # Create the bag-of-words DataFrame; k-mers missing from a sequence
    # come out as NaN and are turned into a count of 0.
    bow_df = pd.DataFrame.from_records(kmer_dicts).fillna(0).astype(int)
    bow_df['label'] = labels
    return bow_df
# Load the dataset and convert it into a 4-mer bag-of-words table.
# 'human.txt' is resolved against the current working directory; point it
# at the actual location of your dataset if it lives elsewhere.
human_data = read_and_process_data('human.txt', k=4)
print(human_data.head())

# Count how many samples carry each class label.
class_distribution = human_data['label'].value_counts()
print("Class Distribution:\n", class_distribution)

# Use seaborn's white-grid look for the figure created below.
sns.set(style="whitegrid")

# Bar chart of the class counts: one bar per label, height = sample count.
plt.figure(figsize=(10, 6))
class_distribution_plot = sns.barplot(
    x=class_distribution.index,
    y=class_distribution.values,
)
class_distribution_plot.set(
    title='Distribution of Classes',
    xlabel='Class Labels',
    ylabel='Frequency',
)
plt.show()