Skip to content

Commit

Permalink
Merge pull request #173 from ufal/refactor-campaign-stats
Browse files (browse the repository at this point in the history)
Make computation of campaign statistics more idiomatic and efficient
  • Loading branch information
kasnerz authored Dec 17, 2024
2 parents d9c5fbc + 25349ed commit be4a857
Showing 1 changed file with 78 additions and 128 deletions.
206 changes: 78 additions & 128 deletions factgenie/analysis.py
Original file line number | Diff line number | Diff line change
@@ -1,83 +1,55 @@
#!/usr/bin/env python3

import re
import glob
import json
import random
import os
import argparse
import pandas as pd
from collections import defaultdict
from scipy.stats import pearsonr
import sys
from pathlib import Path
from slugify import slugify
import logging
import coloredlogs
import traceback
import factgenie.workflows as workflows

from factgenie import CAMPAIGN_DIR

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logger = logging.getLogger(__name__)
# coloredlogs.install(level="INFO", logger=logger, fmt="%(asctime)s %(levelname)s %(message)s")


def create_example_record(line, metadata, annotation_span_categories, annotation_records, jsonl_file):
# a record is created even if there are no annotations
j = json.loads(line)

example_record = workflows.create_annotation_example_record(j, jsonl_file)

for i, category in enumerate(annotation_span_categories):
example_record["cat_" + str(i)] = 0
def generate_example_index(app, campaign):
logger.info(f"Preparing example index for campaign {campaign.campaign_id}")

for annotation in j["annotations"]:
if int(annotation["type"]) == i:
example_record["cat_" + str(i)] += 1
annotation_span_categories = campaign.metadata["config"]["annotation_span_categories"]
example_index = workflows.get_annotation_index(app, force_reload=True).copy()

example_record["annotations"] = [
{
"annotation_type": r["annotation_type"],
"annotation_start": r["annotation_start"],
"annotation_text": r["annotation_text"],
}
for r in annotation_records
]
# Add category count columns to example index
for i in range(len(annotation_span_categories)):
col_name = f"cat_{i}"
example_index[col_name] = example_index["annotations"].apply(
lambda anns: sum(1 for a in anns if a["type"] == i)
)

return example_record
return example_index


def load_annotations_for_campaign(campaign):
annotation_index = []
example_index = []
def generate_span_index(app, campaign):
logger.info(f"Preparing span index for campaign {campaign.campaign_id}")

annotation_span_categories = campaign.metadata["config"]["annotation_span_categories"]
span_index = workflows.get_annotation_index(app).copy()

jsonl_files = glob.glob(os.path.join(CAMPAIGN_DIR, campaign.metadata["id"], "files", "*.jsonl"))
# Remove examples with no annotations
span_index = span_index[span_index["annotations"].apply(lambda x: len(x) > 0)]

for jsonl_file in jsonl_files:
with open(jsonl_file) as f:
lines = f.readlines()
for line in lines:
try:
annotation_records = workflows.load_annotations_from_record(line, split_spans=True)
annotation_index += annotation_records
# Create a separate row for each annotation
span_index = span_index.explode("annotations").reset_index(drop=True)

example_record = create_example_record(
line, campaign.metadata, annotation_span_categories, annotation_records, jsonl_file
)
example_index.append(example_record)
except Exception as e:
logger.error(f"Error while processing line: {line}")
logger.error(e)
# Extract annotation fields into separate columns
span_index["annotation_type"] = span_index["annotations"].apply(lambda x: x["type"])
span_index["annotation_start"] = span_index["annotations"].apply(lambda x: x["start"])
span_index["annotation_text"] = span_index["annotations"].apply(lambda x: x["text"])

annotation_index = pd.DataFrame(annotation_index)
example_index = pd.DataFrame(example_index)
# Drop the original annotations column
span_index = span_index.drop("annotations", axis=1)

return annotation_index, example_index
return span_index


def preprocess_annotations(df, campaign):
Expand All
@@ -100,89 +72,63 @@ def compute_ann_counts(df):
"""
Compute annotation counts for each annotation type (separately for each dataset, split, setup_id).
"""
results = []

all_annotation_types = df["annotation_type"].unique()
all_annotation_types.sort()

for dataset in df["dataset"].unique():
for split in df["split"].unique():
for setup_id in df["setup_id"].unique():
# filter the dataframe
df_filtered = df[(df["dataset"] == dataset) & (df["split"] == split) & (df["setup_id"] == setup_id)]

# make sure that all annotation types are present in the dataframe, even with zero counts
ann_counts = (
df_filtered.groupby("annotation_type")
.size()
.reindex(all_annotation_types, fill_value=0)
.reset_index(name="ann_count")
)
logger.info("Computing annotation counts")

ann_counts["dataset"] = dataset
ann_counts["split"] = split
ann_counts["setup_id"] = setup_id
# Create multi-index groupby once
grouped = df.groupby(["dataset", "split", "setup_id", "annotation_type"]).size().reset_index(name="ann_count")

results.append(ann_counts)
# Create complete multi-index for all combinations
idx = pd.MultiIndex.from_product(
[df["dataset"].unique(), df["split"].unique(), df["setup_id"].unique(), sorted(df["annotation_type"].unique())],
names=["dataset", "split", "setup_id", "annotation_type"],
)

# concatenate all results into a single dataframe
results = pd.concat(results, ignore_index=True)
# Reindex to include all combinations with zeros
results = (
grouped.set_index(["dataset", "split", "setup_id", "annotation_type"]).reindex(idx, fill_value=0).reset_index()
)

return results


def compute_avg_ann_counts(ann_counts, example_index):
# for each line in ann_counts, find the corresponding dataset in datasets and add the number of examples
# then compute the average annotation count

# add a column with the number of examples for each dataset, split
ann_counts["example_count"] = 0

for i, row in ann_counts.iterrows():
dataset = row["dataset"]
split = row["split"]
setup_id = row["setup_id"]

ann_counts.loc[i, "example_count"] = (
example_index[
(example_index["dataset"] == dataset)
& (example_index["split"] == split)
& (example_index["setup_id"] == setup_id)
]
.example_idx.unique()
.shape[0]
)
logger.info("Computing average annotation counts")

# Get example counts through groupby operation
example_counts = (
example_index.groupby(["dataset", "split", "setup_id"])
.agg(example_count=("example_idx", "nunique"))
.reset_index()
.astype({"example_count": int})
)

ann_counts["avg_count"] = ann_counts["ann_count"] / ann_counts["example_count"]
# Merge counts with original dataframe
ann_counts = ann_counts.merge(example_counts, on=["dataset", "split", "setup_id"], how="left")

# round to three decimal places
ann_counts["avg_count"] = ann_counts["avg_count"].round(3)
# Compute average counts vectorized
ann_counts["avg_count"] = (ann_counts["ann_count"] / ann_counts["example_count"]).round(3)

return ann_counts


def compute_prevalence(ann_counts, example_index):
# for each combination of dataset, split, setup_id, annotation_type, compute the percentage of examples that are affected by the annotation type and add it to the `ann_counts` dataframe
for i, row in ann_counts.iterrows():
dataset = row["dataset"]
split = row["split"]
setup_id = row["setup_id"]
annotation_type = row["annotation_type"]

examples = example_index[
(example_index["dataset"] == dataset)
& (example_index["split"] == split)
& (example_index["setup_id"] == setup_id)
& (example_index["cat_" + str(annotation_type)] > 0)
]

if row["example_count"] == 0:
ann_counts.loc[i, "prevalence"] = 0
else:
ann_counts.loc[i, "prevalence"] = examples.shape[0] / row["example_count"]

# round to three decimal places
ann_counts["prevalence"] = ann_counts["prevalence"].round(3)
logger.info("Computing annotation prevalence")

# Compute affected counts for all rows at once
ann_counts["prevalence"] = ann_counts.apply(
lambda row: (
(
(example_index["dataset"] == row["dataset"])
& (example_index["split"] == row["split"])
& (example_index["setup_id"] == row["setup_id"])
& (example_index[f"cat_{row['annotation_type']}"] > 0)
).sum()
/ row["example_count"]
if row["example_count"] > 0
else 0
),
axis=1,
).round(3)

return ann_counts

Expand Down
Expand Up
@@ -257,15 +203,19 @@ def compute_extra_fields_stats(example_index):
def compute_statistics(app, campaign):
statistics = {}

annotation_index, example_index = load_annotations_for_campaign(campaign)
span_index = generate_span_index(app, campaign)
example_index = generate_example_index(app, campaign)

if not annotation_index.empty:
annotation_index = preprocess_annotations(annotation_index, campaign)
if not span_index.empty:
span_index = preprocess_annotations(span_index, campaign)

annotation_counts = compute_ann_counts(annotation_index)
annotation_counts = compute_ann_counts(span_index)
annotation_counts = compute_avg_ann_counts(annotation_counts, example_index)
annotation_counts = compute_prevalence(annotation_counts, example_index)

# replace NaNs with 0
annotation_counts = annotation_counts.fillna(0.0)

statistics["ann_counts"] = {
"full": annotation_counts.to_dict(orient="records"),
"span": aggregate_ann_counts(annotation_counts, "span"),
Expand Down
Expand Up
@@ -383,14 +333,14 @@ def compute_span_counts(example_index, annotator_count, combinations, cat_column
return dataset_level_counts, example_level_counts


def prepare_example_index(combinations, selected_campaigns, campaigns):
def prepare_example_index(app, combinations, selected_campaigns, campaigns):
# gather a list of all examples with some annotations
example_index = pd.DataFrame()

for campaign_id in selected_campaigns:
campaign = campaigns[campaign_id]

_, ei = load_annotations_for_campaign(campaign)
ei = generate_example_index(app, campaign)
example_index = pd.concat([example_index, ei], ignore_index=True)

# a combination is a tuple (dataset, split, setup_id)
Expand Down
Expand Up
@@ -429,7 +379,7 @@ def compute_inter_annotator_agreement(app, selected_campaigns, combinations, cam
combinations = [(c["dataset"], c["split"], c["setup_id"]) for c in combinations]

example_index, annotator_count, annotator_group_ids, cat_columns = prepare_example_index(
combinations=combinations, selected_campaigns=selected_campaigns, campaigns=campaigns
app, combinations=combinations, selected_campaigns=selected_campaigns, campaigns=campaigns
)

dataset_level_counts, example_level_counts = compute_span_counts(
Expand Down

0 comments on commit be4a857

Please sign in to comment.