
Commit

Multiple changes that don't affect final note statuses (split scorer into smaller/faster chunks, and version/type-safety improvements).

Splitting the main scorer into smaller chunks:
1. Enable scoring note subsets in final_note_scoring; this is much faster than rescoring all notes if you are using frozen rater parameters (see the sketch after this list).
2. Split final_scoring into separately runnable final_note_scoring and contributor_scoring jobs.
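
A minimal sketch of what subset scoring might look like, based only on the NoteSubset dataclass and the max-CRH-churn constants added to constants.py in this commit. The import path, the build_note_subsets helper, and the assumption that final note scoring consumes a list of these subsets are illustrative, not the repository's actual entry point.

```python
# Hypothetical wiring of NoteSubset + the new *MaxCrhChurn constants.
# Only NoteSubset, noteIdKey, and the churn constants are taken from this diff;
# everything else here is an assumption for illustration.
from typing import List, Set

import pandas as pd

from sourcecode.scoring import constants as c  # assumed import path


def build_note_subsets(newRatings: pd.DataFrame) -> List[c.NoteSubset]:
  """Group notes into subsets, each with its own allowed CRH status churn."""
  notesWithNewRatings: Set = set(newRatings[c.noteIdKey])
  return [
    # Notes that received new ratings are allowed to flip status more often.
    c.NoteSubset(
      noteSet=notesWithNewRatings,
      maxCrhChurnRate=c.finalNotesWithNewRatingsMaxCrhChurn,
      description="notes with new ratings",
    ),
    # noteSet=None is read here as "all remaining unlocked notes".
    c.NoteSubset(
      noteSet=None,
      maxCrhChurnRate=c.finalUnlockedNotesWithNoNewRatingsMaxCrhChurn,
      description="unlocked notes with no new ratings",
    ),
  ]
```

The per-subset maxCrhChurnRate presumably acts as a guardrail: a rescoring run that flips CRH status on more than that fraction of a subset can be flagged before results are published.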

Version upgrades and type-safety improvements:
3. Add a pandas type-checking util for type-safe merges, joins, and concats (see the sketch after this list).
4. Migrate to pandas 2.2.2 and Python 3.10.
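
The type-checking util itself is not shown in this excerpt; its visible surface is the unsafeAllowed argument now passed to merge and join calls throughout contributor_state.py below. What follows is a hedged sketch of the underlying idea only: the name safe_merge, the signature, and the error message are assumptions, and the real util evidently wraps or patches DataFrame.merge/join so that unsafeAllowed can be passed directly at the call sites.

```python
# Sketch of a dtype-checking merge: fail loudly when a column's dtype changes,
# unless the column is explicitly whitelisted via unsafeAllowed.
from typing import Optional, Set, Union

import pandas as pd


def safe_merge(
  left: pd.DataFrame,
  right: pd.DataFrame,
  unsafeAllowed: Optional[Union[str, Set[str]]] = None,
  **kwargs,
) -> pd.DataFrame:
  """Merge left and right, raising if any column's dtype silently changes."""
  if unsafeAllowed is None:
    unsafeAllowed = set()
  elif isinstance(unsafeAllowed, str):
    unsafeAllowed = {unsafeAllowed}
  originalDtypes = {**left.dtypes.to_dict(), **right.dtypes.to_dict()}
  merged = left.merge(right, **kwargs)
  for col, dtype in merged.dtypes.items():
    original = originalDtypes.get(col)
    if original is not None and dtype != original and col not in unsafeAllowed:
      raise TypeError(
        f"Unexpected dtype change for {col}: {original} -> {dtype}; "
        "add it to unsafeAllowed only if the change is intentional."
      )
  return merged


# Hypothetical call mirroring the ratedAfterDecision merge in contributor_state.py:
# counts = safe_merge(ratingCounts, ratedAfterDecision, on="raterParticipantId",
#                     how="left", unsafeAllowed={"ratedAfterDecision"})
```

Columns typically end up in unsafeAllowed when an outer merge or join introduces missing rows and NaN forces an int or bool column to float, which is exactly the drift this check is meant to surface.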

Note: contributions are blended together from multiple authors, esp. @bradmiller.
jbaxter committed Jun 27, 2024
1 parent a12d407 commit 926c97f
Showing 21 changed files with 1,404 additions and 212 deletions.
98 changes: 66 additions & 32 deletions sourcecode/scoring/constants.py
@@ -32,6 +32,11 @@
tagPercentileForNormalization = 40
intervalHalfWidth = 0.3

# Max flip rates
prescoringAllUnlockedNotesMaxCrhChurn = 0.04
finalUnlockedNotesWithNoNewRatingsMaxCrhChurn = 0.03
finalNotesWithNewRatingsMaxCrhChurn = 0.40

# Data Filenames
scoredNotesOutputPath = "scoredNotes.tsv"
enrollmentInputPath = "userEnrollment-00000.tsv"
@@ -51,6 +56,7 @@
modelingPopulationKey = "modelingPopulation"
modelingGroupKey = "modelingGroup"
numberOfTimesEarnedOutKey = "numberOfTimesEarnedOut"
defaultIndexKey = "index"

# TSV Values
notHelpfulValueTsv = "NOT_HELPFUL"
@@ -237,7 +243,7 @@ def rater_factor_key(i):
(1, "helpfulUnbiasedLanguage"),
]
helpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in helpfulTagsAndTieBreakOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in helpfulTagsTSVOrder]
helpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in helpfulTagsTSVOrder]
helpfulTagsTiebreakOrder = [tag for (tiebreakOrder, tag) in sorted(helpfulTagsAndTieBreakOrder)]

# NOTE: Always add new tags to the end of this list, and *never* change the order of
@@ -275,7 +281,7 @@ def rater_factor_key(i):
(6, notHelpfulNoteNotNeededKey),
]
notHelpfulTagsTSVOrder = [tag for (tiebreakOrder, tag) in notHelpfulTagsAndTieBreakOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int64) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsAndTypesTSVOrder = [(tag, np.int8) for tag in notHelpfulTagsTSVOrder]
notHelpfulTagsTiebreakOrder = [
tag for (tiebreakOrder, tag) in sorted(notHelpfulTagsAndTieBreakOrder)
]
@@ -287,10 +293,16 @@ def rater_factor_key(i):
}
adjustedSuffix = "Adjusted"
notHelpfulTagsAdjustedColumns = [f"{column}{adjustedSuffix}" for column in notHelpfulTagsTSVOrder]
notHelpfulTagsAdjustedTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedColumns
]
ratioSuffix = "Ratio"
notHelpfulTagsAdjustedRatioColumns = [
f"{column}{ratioSuffix}" for column in notHelpfulTagsAdjustedColumns
]
notHelpfulTagsAdjustedRatioTSVColumnsAndTypes = [
(tag, np.double) for tag in notHelpfulTagsAdjustedRatioColumns
]
ratingWeightKey = "ratingWeight"

incorrectTagRatingsMadeByRaterKey = "incorrectTagRatingsMadeByRater"
@@ -325,13 +337,14 @@ def rater_factor_key(i):
lowDiligenceRaterInterceptRound2Key = "lowDiligenceRaterInterceptRound2"
internalRaterInterceptRound2Key = "internalRaterInterceptRound2"

incorrectFilterColumns = [
notHelpfulIncorrectIntervalKey,
sumOfIncorrectTagRateByRaterIntervalKey,
numVotersIntervalKey,
noteTfIdfIncorrectScoreIntervalKey,
lowDiligenceLegacyNoteInterceptKey,
incorrectFilterColumnsAndTypes = [
(notHelpfulIncorrectIntervalKey, np.double),
(sumOfIncorrectTagRateByRaterIntervalKey, np.double),
(numVotersIntervalKey, np.double),
(noteTfIdfIncorrectScoreIntervalKey, np.double),
(lowDiligenceLegacyNoteInterceptKey, np.double),
]
incorrectFilterColumns = [col for (col, _) in incorrectFilterColumnsAndTypes]

misleadingTags = [
"misleadingOther",
@@ -386,7 +399,7 @@ def rater_factor_key(i):
(disagreeKey, np.int64),
(helpfulKey, np.int64),
(notHelpfulKey, np.int64),
(helpfulnessLevelKey, object),
(helpfulnessLevelKey, "category"),
]
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
@@ -429,7 +442,7 @@ def rater_factor_key(i):
(currentExpansionStatusKey, object),
(currentGroupStatusKey, object),
(currentDecidedByKey, object),
(currentModelingGroupKey, object),
(currentModelingGroupKey, np.double), # TODO: int
]
noteStatusHistoryTSVColumns = [col for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
noteStatusHistoryTSVTypes = [dtype for (col, dtype) in noteStatusHistoryTSVColumnsAndTypes]
@@ -450,6 +463,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge = "earnedOutNoAcknowledge"
earnedOutAcknowledged = "earnedOutAcknowledged"
newUser = "newUser"
removed = "removed"
isAtRiskCRNHCount = 2
ratingImpactForEarnIn = 5
ratingImpact = "ratingImpact"
@@ -459,6 +473,7 @@ def rater_factor_key(i):
earnedOutNoAcknowledge: 2,
earnedOutAcknowledged: 3,
newUser: 4,
removed: 5,
}
emergingWriterDays = 28
isEmergingWriterKey = "isEmergingWriter"
@@ -522,25 +537,29 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteParameterUncertaintyTSVColumnsAndTypes
}

auxiliaryScoredNotesTSVColumns = (
auxiliaryScoredNotesTSVColumnsAndTypes = (
[
noteIdKey,
ratingWeightKey,
createdAtMillisKey,
noteAuthorParticipantIdKey,
awaitingMoreRatingsBoolKey,
numRatingsLast28DaysKey,
currentLabelKey,
currentlyRatedHelpfulBoolKey,
currentlyRatedNotHelpfulBoolKey,
unlockedRatingStatusKey,
(noteIdKey, np.int64),
(ratingWeightKey, np.double),
(createdAtMillisKey, np.int64),
(noteAuthorParticipantIdKey, object),
(awaitingMoreRatingsBoolKey, np.int8),
(numRatingsLast28DaysKey, np.int64),
(currentLabelKey, str),
(currentlyRatedHelpfulBoolKey, np.int8),
(currentlyRatedNotHelpfulBoolKey, np.int8),
(unlockedRatingStatusKey, str),
]
+ helpfulTagsTSVOrder
+ notHelpfulTagsTSVOrder
+ notHelpfulTagsAdjustedColumns
+ notHelpfulTagsAdjustedRatioColumns
+ incorrectFilterColumns
+ helpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAndTypesTSVOrder
+ notHelpfulTagsAdjustedTSVColumnsAndTypes
+ notHelpfulTagsAdjustedRatioTSVColumnsAndTypes
+ incorrectFilterColumnsAndTypes
)
auxiliaryScoredNotesTSVColumns = [col for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes]
auxiliaryScoredNotesTSVTypeMapping = {
col: dtype for (col, dtype) in auxiliaryScoredNotesTSVColumnsAndTypes
}

deprecatedNoteModelOutputColumns = frozenset(
{
@@ -610,7 +629,7 @@ def rater_factor_key(i):
(topicNoteFactor1Key, np.double),
(topicRatingStatusKey, str),
(noteTopicKey, str),
(topicNoteConfidentKey, str),
(topicNoteConfidentKey, pd.BooleanDtype()),
(expansionInternalActiveRulesKey, str),
(expansionPlusInternalActiveRulesKey, str),
(groupInternalActiveRulesKey, str),
@@ -638,10 +657,7 @@ def rater_factor_key(i):
(crhCrnhRatioDifferenceKey, np.double),
(meanNoteScoreKey, np.double),
(raterAgreeRatioKey, np.double),
(
aboveHelpfulnessThresholdKey,
"boolean",
), # nullable bool https://pandas.pydata.org/docs/user_guide/boolean.html
(aboveHelpfulnessThresholdKey, pd.BooleanDtype()),
(scorerNameKey, str),
(internalRaterReputationKey, np.double),
(lowDiligenceRaterInterceptKey, np.double),
@@ -681,7 +697,7 @@ def rater_factor_key(i):
(successfulRatingNeededToEarnIn, pd.Int64Dtype()),
(authorTopNotHelpfulTagValues, str),
(timestampOfLastStateChange, np.double),
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool
(aboveHelpfulnessThresholdKey, np.float64), # nullable bool.
(isEmergingWriterKey, pd.BooleanDtype()),
(aggregateRatingReceivedTotal, pd.Int64Dtype()),
(timestampOfLastEarnOut, np.double),
@@ -731,6 +747,17 @@ def rater_factor_key(i):
col: dtype for (col, dtype) in noteStatusChangeTSVColumnsAndTypes
}

datasetKeyKey = "datasetKey"
partitionToReadKey = "partitionToRead"
fileNameToReadKey = "fileNameToRead"
inputPathsTSVColumnsAndTypes = [
(datasetKeyKey, str),
(partitionToReadKey, str),
(fileNameToReadKey, str),
]
inputPathsTSVColumns = [col for (col, _) in inputPathsTSVColumnsAndTypes]
inputPathsTSVTypeMapping = {col: dtype for (col, dtype) in inputPathsTSVColumnsAndTypes}


@contextmanager
def time_block(label):
@@ -830,3 +857,10 @@ class ModelResult:
auxiliaryNoteInfo: pd.DataFrame
scorerName: Optional[str]
metaScores: Optional[PrescoringMetaScorerOutput]


@dataclass
class NoteSubset:
noteSet: Optional[set]
maxCrhChurnRate: float
description: str
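
This commit moves several columns onto nullable pandas extension dtypes (pd.BooleanDtype(), pd.Int64Dtype()) and whitelists others via unsafeAllowed in the contributor_state.py hunks below; both are about how pandas handles missing values introduced by outer merges and joins. The snippet is a standalone pandas 2.x illustration of that behaviour, not repository code.

```python
import pandas as pd

left = pd.DataFrame({"noteId": [1, 2, 3]})
right = pd.DataFrame({"noteId": [1, 2], "numRatings": [5, 7]})  # plain int64 column

# The outer merge leaves noteId 3 without a rating count; NaN forces an upcast.
merged = left.merge(right, on="noteId", how="outer")
print(merged["numRatings"].dtype)  # float64

# A nullable extension dtype keeps its type and stores the gap as <NA> instead.
rightNullable = right.astype({"numRatings": "Int64"})  # i.e. pd.Int64Dtype()
mergedNullable = left.merge(rightNullable, on="noteId", how="outer")
print(mergedNullable["numRatings"].dtype)  # Int64
```

With the type-checking util, any remaining drift of this kind becomes an explicit, reviewable unsafeAllowed entry rather than a silent side effect.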
71 changes: 59 additions & 12 deletions sourcecode/scoring/contributor_state.py
@@ -17,7 +17,8 @@ def should_earn_in(contributorScoresWithEnrollment: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(contributorScoresWithEnrollment[c.enrollmentState] != c.earnedIn)
(contributorScoresWithEnrollment[c.enrollmentState] != c.removed)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.earnedIn)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.atRisk)
& (
contributorScoresWithEnrollment[c.ratingImpact]
@@ -36,7 +37,8 @@ def newly_at_risk(authorEnrollmentCounts: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutNoAcknowledge)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.enrollmentState] != c.atRisk)
@@ -55,7 +57,8 @@ def is_earned_out(authorEnrollmentCounts: pd.DataFrame):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.notesCurrentlyRatedNotHelpful] > c.isAtRiskCRNHCount)
)
@@ -71,7 +74,8 @@ def newly_earned_in(authorEnrollmentCounts):
authorEnrollmentCounts (pd.DataFrame): Scored Notes + User Enrollment status
"""
return (
(authorEnrollmentCounts[c.enrollmentState] != c.newUser)
(authorEnrollmentCounts[c.enrollmentState] != c.removed)
& (authorEnrollmentCounts[c.enrollmentState] != c.newUser)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutAcknowledged)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedOutNoAcknowledge)
& (authorEnrollmentCounts[c.enrollmentState] != c.earnedIn)
@@ -165,17 +169,25 @@ def _get_visible_rating_counts(
ratingCounts = validRatings.groupby(c.raterParticipantIdKey).sum()[ratingCountRows]

ratingsWithScores = get_ratings_with_scores(ratings, noteStatusHistory, scoredNotes)

historyCounts = ratingsWithScores.groupby(c.raterParticipantIdKey).sum()[
[c.awaitingMoreRatingsBoolKey]
]
historyCounts[c.ratingsAwaitingMoreRatings] = historyCounts[c.awaitingMoreRatingsBoolKey]
ratedAfterDecision = _get_rated_after_decision(ratings, noteStatusHistory)
historyCounts = historyCounts.merge(ratedAfterDecision, on=c.raterParticipantIdKey, how="left")
historyCounts = historyCounts.merge(
ratedAfterDecision,
on=c.raterParticipantIdKey,
how="left",
unsafeAllowed=c.ratedAfterDecision,
)
# Fill in zero for any rater who didn't rate any notes after status was assigned and consequently
# doesn't appear in the dataframe.
historyCounts = historyCounts.fillna({c.ratedAfterDecision: 0})

ratingCounts = ratingCounts.merge(historyCounts, on=c.raterParticipantIdKey, how="outer")
ratingCounts = ratingCounts.merge(
historyCounts, on=c.raterParticipantIdKey, how="outer", unsafeAllowed=set(ratingCountRows)
)
for rowName in ratingCountRows:
ratingCounts[rowName] = ratingCounts[rowName].fillna(0)
return ratingCounts
@@ -310,7 +322,7 @@ def is_emerging_writer(scoredNotes: pd.DataFrame):
"""
authorCounts = author_helpfulness(scoredNotes, c.coreNoteInterceptKey)
raterCounts = scoredNotes.groupby(c.noteAuthorParticipantIdKey).sum(numeric_only=True)[
c.numRatingsLast28DaysKey
[c.numRatingsLast28DaysKey]
]
emergingWriter = (
authorCounts.join(raterCounts, how="outer", lsuffix="_author", rsuffix="_rater")
@@ -349,6 +361,7 @@ def single_trigger_earn_out(contributorScoresWithEnrollment: pd.DataFrame) -> pd
!= c.enrollmentStateToThrift[c.earnedOutAcknowledged]
)
& (contributorScoresWithEnrollment[c.enrollmentState] != c.enrollmentStateToThrift[c.newUser])
& (contributorScoresWithEnrollment[c.enrollmentState] != c.enrollmentStateToThrift[c.removed])
)

contributorScoresWithEnrollment.loc[earnedOutUsers, c.numberOfTimesEarnedOutKey] = (
@@ -412,8 +425,8 @@ def get_contributor_state(
) -> pd.DataFrame:
"""
Given scored notes, ratings, note status history, the current user enrollment state, this
uses the contributor counts over ratings and notes and transitions the user between the different
enrollment states.
uses the contributor counts over ratings and notes and transitions the user between the
different enrollment states. If current user enrollment state is removed, do not change.
Args:
scoredNotes (pd.DataFrame): scored notes
@@ -434,7 +447,11 @@ def get_contributor_state(
# We need to consider only the last 5 notes for enrollment state. The ratings are aggregated historically.
# For users who have earned out, we should only consider notes written since the earn out event
scoredNotesWithLastEarnOut = scoredNotes.merge(
userEnrollment, left_on=c.noteAuthorParticipantIdKey, right_on=c.participantIdKey, how="left"
userEnrollment[[c.participantIdKey, c.timestampOfLastEarnOut]],
left_on=c.noteAuthorParticipantIdKey,
right_on=c.participantIdKey,
how="left",
unsafeAllowed=c.timestampOfLastEarnOut,
)
# For users who don't appear in the userEnrollment file, set their timeStampOfLastEarnOut to default
scoredNotesWithLastEarnOut[c.timestampOfLastEarnOut].fillna(1, inplace=True)
@@ -462,6 +479,7 @@ def get_contributor_state(
left_on=c.raterParticipantIdKey,
right_on=c.noteAuthorParticipantIdKey,
how="outer",
unsafeAllowed=c.hasCrnhSinceEarnOut,
).drop(columns=[c.noteAuthorParticipantIdKey])

with c.time_block("Contributor State: Emerging Writers"):
@@ -472,12 +490,23 @@ def get_contributor_state(
left_on=c.raterParticipantIdKey,
right_on=c.noteAuthorParticipantIdKey,
how="outer",
unsafeAllowed=c.isEmergingWriterKey,
).drop(columns=[c.noteAuthorParticipantIdKey])

with c.time_block("Contributor State: Combining"):
# We merge the current enrollment state
contributorScoresWithEnrollment = contributorScores.merge(
userEnrollment, left_on=c.raterParticipantIdKey, right_on=c.participantIdKey, how="outer"
userEnrollment,
left_on=c.raterParticipantIdKey,
right_on=c.participantIdKey,
how="outer",
unsafeAllowed={
c.successfulRatingNeededToEarnIn,
c.timestampOfLastStateChange,
c.numberOfTimesEarnedOutKey,
"coreBool",
"expansionBool",
},
)

# We set the new contributor state.
@@ -608,7 +637,25 @@ def get_contributor_scores(
scoredNotes, lastNNotes, countNMRNotesLast, sinceLastEarnOut
)
contributorCounts = (
visibleRatingCounts.join(visibleNoteCounts, lsuffix="note", rsuffix="rater", how="outer")
visibleRatingCounts.join(
visibleNoteCounts,
lsuffix="note",
rsuffix="rater",
how="outer",
unsafeAllowed={
c.defaultIndexKey,
c.awaitingMoreRatingsBoolKey + "note",
c.ratingsAwaitingMoreRatings,
c.currentlyRatedHelpfulBoolKey,
c.currentlyRatedNotHelpfulBoolKey,
c.awaitingMoreRatingsBoolKey + "rater",
c.notesCurrentlyRatedHelpful,
c.notesCurrentlyRatedNotHelpful,
c.notesAwaitingMoreRatings,
c.numRatingsKey,
c.aggregateRatingReceivedTotal,
},
)
.reset_index()
.rename({"index": c.raterParticipantIdKey}, axis=1)[
[