Skip to content

Commit

Permalink
Flag to let scip encoder infer language based on suffix
Browse files Browse the repository at this point in the history
Summary:
SCIP is a multi language schema, and the language of a symbol is associated with the file the definition is contained in.
In some indexers, this is set in the scip data, and for older indexers that don't provide a scip.Language we can use --language to enforce a language for the entire indexing run.

However, for Java and Kotlin, this doesn't work, as they are intermingled in the same build, and sometimes the same target.
So we have to fall back to detecting by suffix in this case.

Language selection for symbols when converting SCIP to Glean is thus:
- if the scip.Language value is set, use it
- if --infer-language is set, attempt to use the file suffix
- otherwise use --language
- else UnknownLanguage

Reviewed By: simonmar

Differential Revision: D60156776

fbshipit-source-id: c0a94259294207c513ef123d2d7b79236f9cad37
  • Loading branch information
donsbot authored and facebook-github-bot committed Jul 24, 2024
1 parent 1a96455 commit 597a803
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 13 deletions.
35 changes: 28 additions & 7 deletions glean/lang/scip/Data/SCIP/Angle.hs
Original file line number Diff line number Diff line change
Expand Up @@ -112,31 +112,35 @@ getOrSetFact sym = do
--
-- | Pure entry point: turn the raw bytes of a SCIP index into a single
-- 'Aeson.Value' (a JSON array).
--
-- Arguments, in order: an optional default language, the infer-language
-- flag, two optional path arguments (passed through untouched to
-- 'runTranslate'), and the encoded SCIP payload.
--
-- NOTE(review): this is a diff rendering, so the equation and the
-- @(result,_)@ binding each appear twice. The variants without
-- @inferLanguage@ look like the pre-change lines; confirm against the
-- real file.
scipToAngle
:: Maybe SCIP.LanguageId
-> Bool
-> Maybe FilePath
-> Maybe FilePath
-> B.ByteString
-> Aeson.Value
scipToAngle mlang mPathPrefix mStripPrefix scip = Aeson.Array $ V.fromList $
scipToAngle mlang inferLanguage mPathPrefix mStripPrefix scip =
Aeson.Array $ V.fromList $
-- the translated predicates are inserted into an initially empty map
-- before being rendered to JSON
SCIP.generateSCIPJSON (SCIP.insertPredicateMap HashMap.empty result)
where
-- run the translation from 'emptyState'; the final state is discarded
(result,_) = runState
(runTranslate mlang mPathPrefix mStripPrefix scip) emptyState
(result,_) = runState (runTranslate mlang
inferLanguage mPathPrefix mStripPrefix scip) emptyState

-- | First pass, grab all the occurrences with _role := Definition
-- build up symbol string -> fact id for all defs
--
-- Decodes the bytes as a 'Scip.Index' protobuf message, then translates
-- the index metadata followed by every document, returning all resulting
-- predicates in that order. The language, infer-language flag and the two
-- path arguments are handed unchanged to 'decodeScipDoc' for each document.
--
-- A payload that fails to decode aborts via 'error' with the decoder's
-- message.
--
-- NOTE(review): this is a diff rendering, so the equation and the
-- 'decodeScipDoc' call each appear twice. The variants without
-- @inferLanguage@ look like the pre-change lines; confirm against the
-- real file.
runTranslate
:: Maybe SCIP.LanguageId
-> Bool
-> Maybe FilePath
-> Maybe FilePath
-> B.ByteString
-> Parse [SCIP.Predicate]
runTranslate mlang mPathPrefix mStripPrefix scip =
runTranslate mlang inferLanguage mPathPrefix mStripPrefix scip =
case Proto.decodeMessage scip of
-- undecodable input is treated as fatal
Left err -> error err
Right (v :: Scip.Index) -> do
a <- decodeScipMetadata (v ^. Scip.metadata)
bs <- mapM
(decodeScipDoc mlang mPathPrefix mStripPrefix) (v ^. Scip.documents)
(decodeScipDoc mlang inferLanguage mPathPrefix mStripPrefix)
(v ^. Scip.documents)
-- metadata predicates first, then the per-document predicates flattened
return (a <> concat bs)

--
Expand All @@ -146,11 +150,12 @@ runTranslate mlang mPathPrefix mStripPrefix scip =
--
decodeScipDoc
:: Maybe SCIP.LanguageId
-> Bool
-> Maybe FilePath
-> Maybe FilePath
-> Scip.Document
-> Parse [SCIP.Predicate]
decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
decodeScipDoc mlang inferLanguage mPathPrefix mStripPrefix doc = do
srcFileId <- nextId
let filepath0 = doc ^. Scip.relativePath
-- first, strip any matching prefix
Expand All @@ -169,9 +174,15 @@ decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
let parseLang = SCIP.parseLanguage (doc ^. Scip.language)
langEnum = fromEnum $ case parseLang of
SCIP.UnknownLanguage
-- if --infer-language , look at the suffix
| inferLanguage
, Just langId <- fileLanguageOf filepath
-> langId
-- otherwise if --language, assume that's correct
| Just langId <- mlang -> langId -- use default if present
-- otherwise it's really unknown
| otherwise -> SCIP.UnknownLanguage
x -> x
x -> x -- scip document provides the language
fileLang <- SCIP.predicateId "scip.FileLanguage" langFileId
[ "file" .= srcFileId
, "language" .= langEnum
Expand All @@ -180,6 +191,16 @@ decodeScipDoc mlang mPathPrefix mStripPrefix doc = do
infos <- mapM decodeScipInfo (doc ^. Scip.symbols)
return (srcFile : fileLang <> concat (occs <> infos))

-- | Guess a file's language from its extension.
--
-- We really don't want to do a general purpose language detector
-- but rely on the indexer knowing things. For the Java/Kotlin case,
-- files are frequently intermingled in the same build so we can't
-- decide a priori which language is being indexed
--
-- The leading dot is part of each pattern, so only a genuine @.kt@ or
-- @.java@ extension matches; a path that merely ends in the letters
-- "kt" or "java" is left alone. Any other path yields 'Nothing', leaving
-- the caller to pick a fallback.
fileLanguageOf :: Text -> Maybe SCIP.LanguageId
fileLanguageOf filepath
  | ".kt" `Text.isSuffixOf` filepath = Just SCIP.Kotlin
  | ".java" `Text.isSuffixOf` filepath = Just SCIP.Java
  | otherwise = Nothing

decodeScipInfo :: Scip.SymbolInformation -> Parse [SCIP.Predicate]
decodeScipInfo info = do
(docIds, docFacts) <- unzip <$> forM scipDocs (\docStr -> do
Expand Down
7 changes: 4 additions & 3 deletions glean/lang/scip/Glean/SCIP/Driver.hs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ runIndexer params@ScipIndexerParams{..} = do
when scipWritesLocal $ do
copyFile (repoDir </> "index.scip") scipFile
removeFile (repoDir </> "index.scip")
processSCIP scipLanguage Nothing Nothing scipFile
processSCIP scipLanguage False Nothing Nothing scipFile

-- | Run a SCIP indexer on a repository, put scip dump output into outputFile
runSCIPIndexer :: ScipIndexerParams -> FilePath -> IO ()
Expand All @@ -69,10 +69,11 @@ runSCIPIndexer ScipIndexerParams{..} outputFile =
-- | Convert a SCIP protobuf-encoded file into a Glean lsif.angle JSON object
--
-- Logs which file is being used, reads it, and applies 'scipToAngle' to
-- its contents. The first four arguments (default language,
-- infer-language flag and the two optional path arguments) are forwarded
-- to 'scipToAngle' unchanged; the last is the path of the SCIP file.
--
-- NOTE(review): this is a diff rendering, so the equation and the
-- 'scipToAngle' call each appear twice. The variants without
-- @inferLanguage@ look like the pre-change lines; confirm against the
-- real file.
processSCIP
:: Maybe LanguageId
-> Bool
-> Maybe FilePath
-> Maybe FilePath
-> FilePath
-> IO Aeson.Value
processSCIP mlang mPathPrefix mStripPrefix scipFile = do
processSCIP mlang inferLanguage mPathPrefix mStripPrefix scipFile = do
logInfo $ "Using SCIP from " <> scipFile
scipToAngle mlang mPathPrefix mStripPrefix <$> B.readFile scipFile
scipToAngle mlang inferLanguage mPathPrefix mStripPrefix <$> B.readFile scipFile
2 changes: 1 addition & 1 deletion glean/lang/scip/indexer/Glean/Indexer/SCIP.hs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ indexer = Indexer {
if mFile
then pure indexerRoot
else error "Neither --input nor --root are scip files"
val <- SCIP.processSCIP Nothing Nothing Nothing scipFile
val <- SCIP.processSCIP Nothing False Nothing Nothing scipFile
sendJsonBatches backend repo "scip" val
derive backend repo
}
Expand Down
10 changes: 8 additions & 2 deletions glean/lang/scip/indexer/Glean/Indexer/SCIP/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ data SCIP = SCIP
{ scipFile :: FilePath -- ^ input file
, outputFile :: FilePath -- ^ output file
, scipLanguage :: Maybe LanguageId -- ^ a default language if known
, inferLanguage :: Bool -- ^ default False, infer language using file suffix
, scipPathPrefix :: Maybe FilePath -- ^ optional path to prefix file paths
, stripPathPrefix :: Maybe FilePath -- ^ optional prefix to drop from paths
}
Expand All @@ -41,6 +42,11 @@ options = do
metavar "LANGUAGE" <>
value Nothing <>
help "Default language of files in the index"
-- Boolean switch (-i / --infer-language), off unless passed.
-- The two help fragments are joined with (<>), so the first must end in a
-- space or the rendered text runs "suffix" straight into "(when set ...".
inferLanguage <- switch $
  short 'i' <>
  long "infer-language" <>
  help ("Infer symbol language based on file suffix " <>
    "(when set this takes precedence over --language)")
scipPathPrefix <- option (Just <$> str) $ long "root-prefix" <>
metavar "PATH" <>
value Nothing <>
Expand All @@ -49,7 +55,6 @@ options = do
metavar "PATH" <>
value Nothing <>
help "Path prefix to strip from path data"

return SCIP{..}

-- If the indexer doesn't set the language Id of the files, we
Expand All @@ -71,5 +76,6 @@ main :: IO ()
main = withOptions (info (helper <*> options) fullDesc) $ \SCIP{..} -> do
scipExists <- doesFileExist scipFile
when (not scipExists) $ error ("Could not find SCIP file at: " <> scipFile)
json <- SCIP.processSCIP scipLanguage scipPathPrefix stripPathPrefix scipFile
json <- SCIP.processSCIP scipLanguage inferLanguage scipPathPrefix
stripPathPrefix scipFile
Util.writeJSON outputFile json

0 comments on commit 597a803

Please sign in to comment.