
Commit

Multiple changes that don't affect final note statuses (split scorer into smaller/faster chunks, and version/type-safety improvements).

Splitting the main scorer into smaller chunks:
1. Enable scoring note subsets in final_note_scoring; this is much faster than rescoring all notes if you are using frozen rater parameters (see the sketch after this list).
2. Split final_scoring into separately runnable final_note_scoring and contributor_scoring jobs.
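
A minimal sketch of what subset scoring might look like, based only on the NoteSubset dataclass and the max-CRH-churn constants added to constants.py in this commit. The import path, the build_note_subsets helper, and the assumption that final note scoring consumes a list of these subsets are illustrative, not the repository's actual entry point.

```python
# Hypothetical wiring of NoteSubset + the new *MaxCrhChurn constants.
# Only NoteSubset, noteIdKey, and the churn constants are taken from this diff;
# everything else here is an assumption for illustration.
from typing import List, Set

import pandas as pd

from sourcecode.scoring import constants as c  # assumed import path


def build_note_subsets(newRatings: pd.DataFrame) -> List[c.NoteSubset]:
  """Group notes into subsets, each with its own allowed CRH status churn."""
  notesWithNewRatings: Set = set(newRatings[c.noteIdKey])
  return [
    # Notes that received new ratings are allowed to flip status more often.
    c.NoteSubset(
      noteSet=notesWithNewRatings,
      maxCrhChurnRate=c.finalNotesWithNewRatingsMaxCrhChurn,
      description="notes with new ratings",
    ),
    # noteSet=None is read here as "all remaining unlocked notes".
    c.NoteSubset(
      noteSet=None,
      maxCrhChurnRate=c.finalUnlockedNotesWithNoNewRatingsMaxCrhChurn,
      description="unlocked notes with no new ratings",
    ),
  ]
```

The per-subset maxCrhChurnRate presumably acts as a guardrail: a rescoring run that flips CRH status on more than that fraction of a subset can be flagged before results are published.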

Version upgrades and type-safety improvements:
3. Add a pandas type-checking util for type-safe merges, joins, and concats (see the sketch after this list).
4. Migrate to pandas 2.2.2 and Python 3.10.
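
The type-checking util itself is not shown in this excerpt; its visible surface is the unsafeAllowed argument now passed to merge and join calls throughout contributor_state.py below. What follows is a hedged sketch of the underlying idea only: the name safe_merge, the signature, and the error message are assumptions, and the real util evidently wraps or patches DataFrame.merge/join so that unsafeAllowed can be passed directly at the call sites.

```python
# Sketch of a dtype-checking merge: fail loudly when a column's dtype changes,
# unless the column is explicitly whitelisted via unsafeAllowed.
from typing import Optional, Set, Union

import pandas as pd


def safe_merge(
  left: pd.DataFrame,
  right: pd.DataFrame,
  unsafeAllowed: Optional[Union[str, Set[str]]] = None,
  **kwargs,
) -> pd.DataFrame:
  """Merge left and right, raising if any column's dtype silently changes."""
  if unsafeAllowed is None:
    unsafeAllowed = set()
  elif isinstance(unsafeAllowed, str):
    unsafeAllowed = {unsafeAllowed}
  originalDtypes = {**left.dtypes.to_dict(), **right.dtypes.to_dict()}
  merged = left.merge(right, **kwargs)
  for col, dtype in merged.dtypes.items():
    original = originalDtypes.get(col)
    if original is not None and dtype != original and col not in unsafeAllowed:
      raise TypeError(
        f"Unexpected dtype change for {col}: {original} -> {dtype}; "
        "add it to unsafeAllowed only if the change is intentional."
      )
  return merged


# Hypothetical call mirroring the ratedAfterDecision merge in contributor_state.py:
# counts = safe_merge(ratingCounts, ratedAfterDecision, on="raterParticipantId",
#                     how="left", unsafeAllowed={"ratedAfterDecision"})
```

Columns typically end up in unsafeAllowed when an outer merge or join introduces missing rows and NaN forces an int or bool column to float, which is exactly the drift this check is meant to surface.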

Note: contributions are blended together from multiple authors, esp. @bradmiller.
jbaxter committed Jun 27, 2024
1 parent a12d407 commit 926c97f
Showing 21 changed files with 1,404 additions and 212 deletions.
98 changes: 66 additions & 32 deletions sourcecode/scoring/constants.py
@@ -32,6 +32,11 @@
tagPercentileForNormalization = 40
intervalHalfWidth = 0.3

# Max flip rates
prescoringAllUnlockedNotesMaxCrhChurn = 0.04
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
finalNotesWithNewRatingsMaxCrhChurn = 0.40

# Data Filenames
scoredNotesOutputPath = "scoredNotes.tsv"
enrollmentInputPath = "userEnrollment-00000.tsv"
@@ -51,6 +56,7 @@
modelingPopulationKey = "modelingPopulation"
modelingGroupKey = "modelingGroup"
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
defaultIndexKey = "index"

# TSV Values
notHelpfulValueTsv = "NOT_HELPFUL"
@@ -237,7 +243,7 @@ def rater_factor_key(i):
(1, "helpfulUnbiasedLanguage"),
]
helpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in helpfulTagsAndTieBreakOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in helpfulTagsTSVOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in helpfulTagsTSVOrder]
helpfulTagsTiebreakOrder = [tag for (tiebreakOrder, tag) in sorted(helpfulTagsAndTieBreakOrder)]

# NOTE: Always add new tags to the end of this list, and *never* change the order of
@@ -275,7 +281,7 @@ def rater_factor_key(i):
(6, notHelpfulNoteNotNeededKey),
]
notHelpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in notHelpfulTagsAndTieBreakOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsTiebreakOrder = [
tag for (tiebreakOrder, tag) in sorted(notHelpfulTagsAndTieBreakOrder)
]
@@ -287,10 +293,16 @@ def rater_factor_key(i):
}
adjustedSuffix = "Adjusted"
notHelpfulTagsAdjustedColumns = [f"{column}{adjustedSuffix}" for column in notHelpfulTagsTSVOrder]
notHelpfulTagsAdjustedTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedColumns
]
ratioSuffix = "Ratio"
notHelpfulTagsAdjustedRatioColumns = [
f"{column}{ratioSuffix}" for column in notHelpfulTagsAdjustedColumns
]
notHelpfulTagsAdjustedRatioTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedRatioColumns
]
ratingWeightKey = "ratingWeight"

incorrectTagRatingsMadeByRaterKey = "incorrectTagRatingsMadeByRater"
@@ -325,13 +337,14 @@ def rater_factor_key(i):
lowDiligenceRaterInterceptRound2Key = "lowDiligenceRaterInterceptRound2"
internalRaterInterceptRound2Key = "internalRaterInterceptRound2"

incorrectFilterColumns = [
notHelpfulIncorrectIntervalKey,
sumOfIncorrectTagRateByRaterIntervalKey,
numVotersIntervalKey,
noteTfIdfIncorrectScoreIntervalKey,
lowDiligenceLegacyNoteInterceptKey,
incorrectFilterColumnsAndTypes = [
(notHelpfulIncorrectIntervalKey, np.double),
(sumOfIncorrectTagRateByRaterIntervalKey, np.double),
(numVotersIntervalKey, np.double),
(noteTfIdfIncorrectScoreIntervalKey, np.double),
(lowDiligenceLegacyNoteInterceptKey, np.double),
]
incorrectFilterColumns = [col for (col, _) in incorrectFilterColumnsAndTypes]

misleadingTags = [
"misleadingOther",
@@ -386,7 +399,7 @@ def rater_factor_key(i):
(disagreeKey, np.int64),
(helpfulKey, np.int64),
(notHelpfulKey, np.int64),
(helpfulnessLevelKey, object),
(helpfulnessLevelKey, "category"),
]
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
@@ -429,7 +442,7 @@ def rater_factor_key(i):
(currentExpansionStatusKey, object),
(currentGroupStatusKey, object),
(currentDecidedByKey, object),
(currentModelingGroupKey, object),
(currentModelingGroupKey, np.double), # TODO: int
]
noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
@@ -450,6 +463,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge = "earnedOutNoAcknowledge"
earnedOutAcknowledged = "earnedOutAcknowledged"
newUser = "newUser"
removed = "removed"
isAtRiskCRNHCount = 2
ratingImpactForEarnIn = 5
ratingImpact = "ratingImpact"
@@ -459,6 +473,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge: 2,
earnedOutAcknowledged: 3,
newUser: 4,
removed: 5,
}
emergingWriterDays = 28
isEmergingWriterKey = "isEmergingWriter"
@@ -522,25 +537,29 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteParameterUncertaintyTSVColumnsAndTypes
}

auxiliaryScoredNotesTSVColumns = (
auxiliaryScoredNotesTSVColumnsAndTypes = (
[
noteIdKey,
ratingWeightKey,
createdAtMillisKey,
noteAuthorParticipantIdKey,
awaitingMoreRatingsBoolKey,
numRatingsLast28DaysKey,
currentLabelKey,
currentlyRatedHelpfulBoolKey,
currentlyRatedNotHelpfulBoolKey,
unlockedRatingStatusKey,
(noteIdKey, np.int64),
(ratingWeightKey, np.double),
(createdAtMillisKey, np.int64),
(noteAuthorParticipantIdKey, object),
(awaitingMoreRatingsBoolKey, np.int8),
(numRatingsLast28DaysKey, np.int64),
(currentLabelKey, str),
(currentlyRatedHelpfulBoolKey, np.int8),
(currentlyRatedNotHelpfulBoolKey, np.int8),
(unlockedRatingStatusKey, str),
]
+ helpfulTagsTSVOrder
+ notHelpfulTagsTSVOrder
+ notHelpfulTagsAdjustedColumns
+ notHelpfulTagsAdjustedRatioColumns
+ incorrectFilterColumns
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAdjustedTSVColumnsAndTypes
+ notHelpfulTagsAdjustedRatioTSVColumnsAndTypes
+ incorrectFilterColumnsAndTypes
)
auxiliaryScoredNotesTSVColumns = [col for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes]
auxiliaryScoredNotesTSVTypeMapping = {
col: dtype for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes
}

deprecatedNoteModelOutputColumns = frozenset(
{
@@ -610,7 +629,7 @@ def rater_factor_key(i):
(topicNoteFactor1Key, np.double),
(topicRatingStatusKey, str),
(noteTopicKey, str),
(topicNoteConfidentKey, str),
(topicNoteConfidentKey, pd.BooleanDtype()),
(expansionInternalActiveRulesKey, str),
(expansionPlusInternalActiveRulesKey, str),
(groupInternalActiveRulesKey, str),
@@ -638,10 +657,7 @@ def rater_factor_key(i):
(crhCrnhRatioDifferenceKey, np.double),
(meanNoteScoreKey, np.double),
(raterAgreeRatioKey, np.double),
(
aboveHelpfulnessThresholdKey,
"boolean",
), # nullable bool https://pandas.pydata.org/docs/user_guide/boolean.html
(aboveHelpfulnessThresholdKey, pd.BooleanDtype()),
(scorerNameKey, str),
(internalRaterReputationKey, np.double),
(lowDiligenceRaterInterceptKey, np.double),
@@ -681,7 +697,7 @@ def rater_factor_key(i):
(successfulRatingNeededToEarnIn, pd.Int64Dtype()),
(authorTopNotHelpfulTagValues, str),
(timestampOfLastStateChange, np.double),
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool.
(isEmergingWriterKey, pd.BooleanDtype()),
(aggregateRatingReceivedTotal, pd.Int64Dtype()),
(timestampOfLastEarnOut, np.double),
@@ -731,6 +747,17 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteStatusChangeTSVColumnsAndTypes
}

datasetKeyKey = "datasetKey"
partitionToReadKey = "partitionToRead"
fileNameToReadKey = "fileNameToRead"
inputPathsTSVColumnsAndTypes = [
(datasetKeyKey, str),
(partitionToReadKey, str),
(fileNameToReadKey, str),
]
inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}


@contextmanager
def time_block(label):
@@ -830,3 +857,10 @@ class ModelResult:
auxiliaryNoteInfo: pd.DataFrame
scorerName: Optional[str]
metaScores: Optional[PrescoringMetaScorerOutput]


@dataclass
class NoteSubset:
noteSet: Optional[set]
maxCrhChurnRate: float
description: str
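
This commit moves several columns onto nullable pandas extension dtypes (pd.BooleanDtype(), pd.Int64Dtype()) and whitelists others via unsafeAllowed in the contributor_state.py hunks below; both are about how pandas handles missing values introduced by outer merges and joins. The snippet is a standalone pandas 2.x illustration of that behaviour, not repository code.

```python
import pandas as pd

left = pd.DataFrame({"noteId": [1, 2, 3]})
right = pd.DataFrame({"noteId": [1, 2], "numRatings": [5, 7]})  # plain int64 column

# The outer merge leaves noteId 3 without a rating count; NaN forces an upcast.
merged = left.merge(right, on="noteId", how="outer")
print(merged["numRatings"].dtype)  # float64

# A nullable extension dtype keeps its type and stores the gap as <NA> instead.
rightNullable = right.astype({"numRatings": "Int64"})  # i.e. pd.Int64Dtype()
mergedNullable = left.merge(rightNullable, on="noteId", how="outer")
print(mergedNullable["numRatings"].dtype)  # Int64
```

With the type-checking util, any remaining drift of this kind becomes an explicit, reviewable unsafeAllowed entry rather than a silent side effect.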
71 changes: 59 additions & 12 deletions sourcecode/scoring/contributor_state.py
@@ -17,7 +17,8 @@ def should_earn_in(contributorScoresWithEnrollment: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(contributorScoresWithEnrollment[c.enrollmentState] != c.earnedIn)
(contributorScoresWithEnrollment[c.enrollmentState] != c.removed)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.earnedIn)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.atRisk)
& (
contributorScoresWithEnrollment[c.ratingImpact]
@@ -36,7 +37,8 @@ def newly_at_risk(authorEnrollmentCounts: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutNoAcknowledge)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.enrollmentState] != c.atRisk)
@@ -55,7 +57,8 @@ def is_earned_out(authorEnrollmentCounts: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.notesCurrentlyRatedNotHelpful] > c.isAtRiskCRNHCount)
)
@@ -71,7 +74,8 @@ def newly_earned_in(authorEnrollmentCounts):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutNoAcknowledge)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedIn)
@@ -165,17 +169,25 @@ def _get_visible_rating_counts(
ratingCounts = validRatings.groupby(c.raterParticipantIdKey).sum()[ratingCountRows]

ratingsWithScores = get_ratings_with_scores(ratings, noteStatusHistory, scoredNotes)

historyCounts = ratingsWithScores.groupby(c.raterParticipantIdKey).sum()[
[c.awaitingMoreRatingsBoolKey]
]
historyCounts[c.ratingsAwaitingMoreRatings] = historyCounts[c.awaitingMoreRatingsBoolKey]
ratedAfterDecision = _get_rated_after_decision(ratings, noteStatusHistory)
historyCounts = historyCounts.merge(ratedAfterDecision, on=c.raterParticipantIdKey, how="left")
historyCounts = historyCounts.merge(
ratedAfterDecision,
on=c.raterParticipantIdKey,
how="left",
unsafeAllowed=c.ratedAfterDecision,
)
# Fill in zero for any rater who didn't rate any notes after status was assigned and consequently
# doesn't appear in the dataframe.
historyCounts = historyCounts.fillna({c.ratedAfterDecision: 0})

ratingCounts = ratingCounts.merge(historyCounts, on=c.raterParticipantIdKey, how="outer")
ratingCounts = ratingCounts.merge(
historyCounts, on=c.raterParticipantIdKey, how="outer", unsafeAllowed=set(ratingCountRows)
)
for rowName in ratingCountRows:
ratingCounts[rowName] = ratingCounts[rowName].fillna(0)
return ratingCounts
@@ -310,7 +322,7 @@ def is_emerging_writer(scoredNotes: pd.DataFrame):
"""
authorCounts = author_helpfulness(scoredNotes, c.coreNoteInterceptKey)
raterCounts = scoredNotes.groupby(c.noteAuthorParticipantIdKey).sum(numeric_only=True)[
c.numRatingsLast28DaysKey
[c.numRatingsLast28DaysKey]
]
emergingWriter = (
authorCounts.join(raterCounts, how="outer", lsuffix="_author", rsuffix="_rater")
@@ -349,6 +361,7 @@ def single_trigger_earn_out(contributorScoresWithEnrollment: pd.DataFrame) -> pd
!= c.enrollmentStateToThrift[c.earnedOutAcknowledged]
)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.enrollmentStateToThrift[c.newUser])
& (contributorScoresWithEnrollment[c.enrollmentState] != c.enrollmentStateToThrift[c.removed])
)

contributorScoresWithEnrollment.loc[earnedOutUsers, c.numberOfTimesEarnedOutKey] = (
@@ -412,8 +425,8 @@ def get_contributor_state(
) -> pd.DataFrame:
"""
Given scored notes, ratings, note status history, the current user enrollment state, this
uses the contributor counts over ratings and notes and transitions the user between the different
enrollment states.
uses the contributor counts over ratings and notes and transitions the user between the
different enrollment states. If current user enrollment state is removed, do not change.
Args:
scoredNotes (pd.DataFrame): scored notes
@@ -434,7 +447,11 @@ def get_contributor_state(
# We need to consider only the last 5 notes for enrollment state. The ratings are aggregated historically.
# For users who have earned out, we should only consider notes written since the earn out event
scoredNotesWithLastEarnOut = scoredNotes.merge(
userEnrollment, left_on=c.noteAuthorParticipantIdKey, right_on=c.participantIdKey, how="left"
userEnrollment[[c.participantIdKey, c.timestampOfLastEarnOut]],
left_on=c.noteAuthorParticipantIdKey,
right_on=c.participantIdKey,
how="left",
unsafeAllowed=c.timestampOfLastEarnOut,
)
# For users who don't appear in the userEnrollment file, set their timeStampOfLastEarnOut to default
scoredNotesWithLastEarnOut[c.timestampOfLastEarnOut].fillna(1, inplace=True)
@@ -462,6 +479,7 @@ def get_contributor_state(
left_on=c.raterParticipantIdKey,
right_on=c.noteAuthorParticipantIdKey,
how="outer",
unsafeAllowed=c.hasCrnhSinceEarnOut,
).drop(columns=[c.noteAuthorParticipantIdKey])

with c.time_block("Contributor State: Emerging Writers"):
@@ -472,12 +490,23 @@ def get_contributor_state(
left_on=c.raterParticipantIdKey,
right_on=c.noteAuthorParticipantIdKey,
how="outer",
unsafeAllowed=c.isEmergingWriterKey,
).drop(columns=[c.noteAuthorParticipantIdKey])

with c.time_block("Contributor State: Combining"):
# We merge the current enrollment state
contributorScoresWithEnrollment = contributorScores.merge(
userEnrollment, left_on=c.raterParticipantIdKey, right_on=c.participantIdKey, how="outer"
userEnrollment,
left_on=c.raterParticipantIdKey,
right_on=c.participantIdKey,
how="outer",
unsafeAllowed={
c.successfulRatingNeededToEarnIn,
c.timestampOfLastStateChange,
c.numberOfTimesEarnedOutKey,
"coreBool",
"expansionBool",
},
)

# We set the new contributor state.
@@ -608,7 +637,25 @@ def get_contributor_scores(
scoredNotes, lastNNotes, countNMRNotesLast, sinceLastEarnOut
)
contributorCounts = (
visibleRatingCounts.join(visibleNoteCounts, lsuffix="note", rsuffix="rater", how="outer")
visibleRatingCounts.join(
visibleNoteCounts,
lsuffix="note",
rsuffix="rater",
how="outer",
unsafeAllowed={
c.defaultIndexKey,
c.awaitingMoreRatingsBoolKey + "note",
c.ratingsAwaitingMoreRatings,
c.currentlyRatedHelpfulBoolKey,
c.currentlyRatedNotHelpfulBoolKey,
c.awaitingMoreRatingsBoolKey + "rater",
c.notesCurrentlyRatedHelpful,
c.notesCurrentlyRatedNotHelpful,
c.notesAwaitingMoreRatings,
c.numRatingsKey,
c.aggregateRatingReceivedTotal,
},
)
.reset_index()
.rename({"index": c.raterParticipantIdKey}, axis=1)[
[