Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python log parser - handle limited size log content #213

Merged
merged 3 commits into from
Nov 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 92 additions & 48 deletions utils/pythonutils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ const (
Pip PythonTool = "pip"
Pipenv PythonTool = "pipenv"
Poetry PythonTool = "poetry"

startDownloadingPattern = `^\s*Downloading\s`
downloadingCaptureGroup = `[^\s]*`
startUsingCachedPattern = `^\s*Using\scached\s`
usingCacheCaptureGroup = `[\S]+`
endPattern = `\s\(`
)

// PythonTool is a string naming a Python tool; Pip ("pip"), Pipenv ("pipenv") and Poetry ("poetry") are declared values.
type PythonTool string
Expand Down Expand Up @@ -152,6 +158,50 @@ func getFilePath(srcPath, fileName string) (string, error) {
return filePath, nil
}

// Creates the CmdOutputPattern objects that can capture group content that may span multiple lines, for logs that limit the line length.
// Since the log parser parses line by line, we need a parser that can capture group content spanning multiple lines.
func getMultilineSplitCaptureOutputPattern(startCollectingPattern, captureGroup, endCollectingPattern string, handler func(pattern *gofrogcmd.CmdOutputPattern) (string, error)) (parsers []*gofrogcmd.CmdOutputPattern) {
// Prepare regex patterns.
oneLineRegex := regexp.MustCompile(startCollectingPattern + `(` + captureGroup + `)` + endCollectingPattern)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regexp.MustCompile is heavy.
Let's make the effort to compile the regex patterns only once, at package initialization, by putting them in variables outside the functions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are dynamic patterns, constructed from the arguments passed to the method. I don't see any benefit in moving them outside, because I need those values inside the method.

startCollectionRegexp := regexp.MustCompile(startCollectingPattern)
endCollectionRegexp := regexp.MustCompile(endCollectingPattern)

// Create a parser for single line pattern matches.
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{RegExp: oneLineRegex, ExecFunc: handler})

// Create a parser for multi line pattern matches.
lineBuffer := ""
collectingMultiLineValue := false
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{RegExp: regexp.MustCompile(".*"), ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.* does not include newline characters. Is it intended?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parser that we use goes over the content line by line, so a line will never include newlines anyway, since the newline is the delimiter used to split the tokens.
The purpose of this parser is to concatenate the pieces of text, removing the newlines that were added when a value was split across lines.

// Check if the line matches the startCollectingPattern.
if !collectingMultiLineValue && startCollectionRegexp.MatchString(pattern.Line) {
// Start collecting lines.
collectingMultiLineValue = true
lineBuffer = pattern.Line
// We assume that the content is multiline so no need to check end at this point.
// Single line will be handled and matched by the other parser.
return pattern.Line, nil
}
if !collectingMultiLineValue {
return pattern.Line, nil
}
// Add the line content to the buffer.
lineBuffer += pattern.Line
// Check if the line matches the endCollectingPattern.
if endCollectionRegexp.MatchString(pattern.Line) {
collectingMultiLineValue = false
// Simulate a one line content check to make sure we have regex match.
if oneLineRegex.MatchString(lineBuffer) {
return handler(&gofrogcmd.CmdOutputPattern{Line: pattern.Line, MatchedResults: oneLineRegex.FindStringSubmatch(lineBuffer)})
}
}

return pattern.Line, nil
}})

return
}

func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log, srcPath string) (map[string]entities.Dependency, error) {
if tool == Pipenv {
// Add verbosity flag to pipenv commands to collect necessary data
Expand All @@ -161,19 +211,14 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
installCmd.Dir = srcPath

dependenciesMap := map[string]entities.Dependency{}

// Create regular expressions for log parsing.
collectingRegexp := regexp.MustCompile(`^Collecting\s(\w[\w-.]+)`)
downloadingRegexp := regexp.MustCompile(`^\s*Downloading\s([^\s]*)\s\(`)
usingCachedRegexp := regexp.MustCompile(`^\s*Using\scached\s([\S]+)\s\(`)
alreadySatisfiedRegexp := regexp.MustCompile(`^Requirement\salready\ssatisfied:\s(\w[\w-.]+)`)
parsers := []*gofrogcmd.CmdOutputPattern{}

var packageName string
expectingPackageFilePath := false

// Extract downloaded package name.
dependencyNameParser := gofrogcmd.CmdOutputPattern{
RegExp: collectingRegexp,
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{
RegExp: regexp.MustCompile(`^Collecting\s(\w[\w-.]+)`),
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// If this pattern matched a second time before downloaded-file-name was found, prompt a message.
if expectingPackageFilePath {
Expand All @@ -186,7 +231,7 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
}

// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
if len(pattern.MatchedResults)-1 <= 0 {
log.Debug(fmt.Sprintf("Failed extracting package name from line: %s", pattern.Line))
return pattern.Line, nil
}
Expand All @@ -197,49 +242,34 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,

return pattern.Line, nil
},
}

// Extract downloaded file, stored in Artifactory.
downloadedFileParser := gofrogcmd.CmdOutputPattern{
RegExp: downloadingRegexp,
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
log.Debug(fmt.Sprintf("Failed extracting download path from line: %s", pattern.Line))
return pattern.Line, nil
}

// If this pattern matched before package-name was found, do not collect this path.
if !expectingPackageFilePath {
log.Debug(fmt.Sprintf("Could not resolve package name for download path: %s , continuing...", packageName))
return pattern.Line, nil
}
})

// Save dependency information.
filePath := pattern.MatchedResults[1]
lastSlashIndex := strings.LastIndex(filePath, "/")
var fileName string
if lastSlashIndex == -1 {
fileName = filePath
} else {
fileName = filePath[lastSlashIndex+1:]
}
dependenciesMap[strings.ToLower(packageName)] = entities.Dependency{Id: fileName}
expectingPackageFilePath = false

log.Debug(fmt.Sprintf("Found package: %s installed with: %s", packageName, fileName))
saveCaptureGroupAsDependencyInfo := func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
fileName := extractFileNameFromRegexCaptureGroup(pattern)
if fileName == "" {
log.Debug(fmt.Sprintf("Failed extracting download path from line: %s", pattern.Line))
return pattern.Line, nil
},
}
// If this pattern matched before package-name was found, do not collect this path.
if !expectingPackageFilePath {
log.Debug(fmt.Sprintf("Could not resolve package name for download path: %s , continuing...", packageName))
return pattern.Line, nil
}
// Save dependency information.
dependenciesMap[strings.ToLower(packageName)] = entities.Dependency{Id: fileName}
expectingPackageFilePath = false
log.Debug(fmt.Sprintf("Found package: %s installed with: %s", packageName, fileName))
return pattern.Line, nil
}

cachedFileParser := gofrogcmd.CmdOutputPattern{
RegExp: usingCachedRegexp,
ExecFunc: downloadedFileParser.ExecFunc,
}
// Extract downloaded file, stored in Artifactory. (value at log may be split into multiple lines)
parsers = append(parsers, getMultilineSplitCaptureOutputPattern(startDownloadingPattern, downloadingCaptureGroup, endPattern, saveCaptureGroupAsDependencyInfo)...)
// Extract cached file, stored in Artifactory. (value at log may be split into multiple lines)
parsers = append(parsers, getMultilineSplitCaptureOutputPattern(startUsingCachedPattern, usingCacheCaptureGroup, endPattern, saveCaptureGroupAsDependencyInfo)...)

// Extract already installed packages names.
installedPackagesParser := gofrogcmd.CmdOutputPattern{
RegExp: alreadySatisfiedRegexp,
parsers = append(parsers, &gofrogcmd.CmdOutputPattern{
RegExp: regexp.MustCompile(`^Requirement\salready\ssatisfied:\s(\w[\w-.]+)`),
ExecFunc: func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
// Check for out of bound results.
if len(pattern.MatchedResults)-1 < 0 {
Expand All @@ -252,12 +282,26 @@ func InstallWithLogParsing(tool PythonTool, commandArgs []string, log utils.Log,
log.Debug(fmt.Sprintf("Found package: %s already installed", pattern.MatchedResults[1]))
return pattern.Line, nil
},
}
})

// Execute command.
_, errorOut, _, err := gofrogcmd.RunCmdWithOutputParser(installCmd, true, &dependencyNameParser, &downloadedFileParser, &cachedFileParser, &installedPackagesParser)
_, errorOut, _, err := gofrogcmd.RunCmdWithOutputParser(installCmd, true, parsers...)
if err != nil {
return nil, fmt.Errorf("failed running %s command with error: '%s - %s'", string(tool), err.Error(), errorOut)
}
return dependenciesMap, nil
}

// extractFileNameFromRegexCaptureGroup returns the last "/"-separated segment of the first capture
// group stored in pattern.MatchedResults, or "" when the match holds no capture group.
func extractFileNameFromRegexCaptureGroup(pattern *gofrogcmd.CmdOutputPattern) (fileName string) {
	// Index 0 is the whole match; the first capture group, if any, sits at index 1.
	if len(pattern.MatchedResults) < 2 {
		return ""
	}
	captured := pattern.MatchedResults[1]
	// LastIndex yields -1 when there is no "/", so the slice then starts at 0 and keeps the whole value.
	return captured[strings.LastIndex(captured, "/")+1:]
}
127 changes: 127 additions & 0 deletions utils/pythonutils/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package pythonutils

import (
"fmt"
"strings"
"testing"

gofrogcmd "github.com/jfrog/gofrog/io"
"github.com/stretchr/testify/assert"
)

// TestGetMultilineCaptureOutputPattern streams sample install-log text, line by line, through the
// parsers built by getMultilineSplitCaptureOutputPattern and checks that exactly one file name is
// captured, both when the log entry fits on a single line and when it is wrapped over several lines.
func TestGetMultilineCaptureOutputPattern(t *testing.T) {
	tests := []struct {
		name                string
		text                string
		startCapturePattern string
		captureGroupPattern string
		endCapturePattern   string
		expectedCapture     string
	}{
		{
			name:                "Using cached - single line captures",
			startCapturePattern: startUsingCachedPattern,
			captureGroupPattern: usingCacheCaptureGroup,
			endCapturePattern:   endPattern,
			text: `
Looking in indexes:
***localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/simple

Collecting pexpect==4.8.0 (from -r /tmp/pipenv-qzun2hd3-requirements/pipenv-o_899oue-hashed-reqs.txt (line 1))

Using cached http://localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/packages/packages/39/7b/88dbb785881c28a102619d46423cb853b46dbccc70d3ac362d99773a78ce/pexpect-4.8.0-py2.py3-none-any.whl (59 kB)`,
			expectedCapture: `pexpect-4.8.0-py2.py3-none-any.whl`,
		},
		{
			name:                "Using cached - multi line captures",
			startCapturePattern: startUsingCachedPattern,
			captureGroupPattern: usingCacheCaptureGroup,
			endCapturePattern:   endPattern,
			text: `
Looking in indexes:
***localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-16
98829624/simple

Collecting pexpect==4.8.0 (from -r
/tmp/pipenv-qzun2hd3-requirements/pipenv-o_899oue-hashed-reqs.txt (line 1))

Using cached
http://localhost:8081/artifactory/api/pypi/cli-pipenv-pypi-virtual-1698829624/pa
ckages/packages/39/7b/88dbb785881c28a102619d46423cb853b46dbccc70d3ac362d99773a78
ce/pexpect-4.8.0-py2.py3-none-any.whl (59 kB)`,
			expectedCapture: `pexpect-4.8.0-py2.py3-none-any.whl`,
		},
		{
			name:                "Downloading - single line captures",
			startCapturePattern: startDownloadingPattern,
			captureGroupPattern: downloadingCaptureGroup,
			endCapturePattern:   endPattern,
			text: ` Preparing metadata (pyproject.toml): finished with status 'done'
Collecting PyYAML==5.1.2 (from jfrog-python-example==1.0)
Downloading http://localhost:8081/artifactory/api/pypi/cli-pypi-virtual-1698829558/packages/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.0/265.0 kB 364.4 MB/s eta 0:00:00
Installing build dependencies: started`,
			expectedCapture: `PyYAML-5.1.2.tar.gz`,
		},
		{
			name:                "Downloading - multi line captures",
			startCapturePattern: startDownloadingPattern,
			captureGroupPattern: downloadingCaptureGroup,
			endCapturePattern:   endPattern,
			text: ` Preparing metadata (pyproject.toml): finished with status 'done'
Collecting PyYAML==5.1.2 (from jfrog-python-example==1.0)
Downloading http://localhost:8081/artifactory/api/pypi/cli-pypi-virtual-1698
829558/packages/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849
582fe/PyYAML-5.1.2.tar.gz (265 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 265.0/265.0 kB 364.4 MB/s eta 0:00:00
Installing build dependencies: started`,
			expectedCapture: `PyYAML-5.1.2.tar.gz`,
		},
	}

	for _, testCase := range tests {
		t.Run(testCase.name, func(t *testing.T) {
			aggFunc, captures := getCapturesFromTest(testCase.expectedCapture)
			runDummyTextStream(t, testCase.text, getMultilineSplitCaptureOutputPattern(
				testCase.startCapturePattern,
				testCase.captureGroupPattern,
				testCase.endCapturePattern,
				aggFunc,
			))
			// Stop when the count is wrong: with no captures, evaluating (*captures)[0] below would
			// panic with an index-out-of-range error instead of reporting a regular test failure.
			if !assert.Len(t, (*captures), 1, fmt.Sprintf("Expected 1 captured group, got size: %d", len(*captures))) {
				return
			}
			assert.Equal(t, testCase.expectedCapture, (*captures)[0], fmt.Sprintf("Expected capture group: %s, got: %s", testCase.expectedCapture, (*captures)[0]))
		})
	}
}

// getCapturesFromTest returns a handler with the ExecFunc signature and a pointer to the slice that
// the handler fills. On every call the handler extracts the file name from the pattern's capture
// group and appends it once for each entry of expectedCaptures that equals it; it always returns
// pattern.Line and a nil error.
func getCapturesFromTest(expectedCaptures ...string) (func(pattern *gofrogcmd.CmdOutputPattern) (string, error), *[]string) {
	collected := make([]string, 0)
	return func(pattern *gofrogcmd.CmdOutputPattern) (string, error) {
		fileName := extractFileNameFromRegexCaptureGroup(pattern)
		for i := range expectedCaptures {
			if fileName != expectedCaptures[i] {
				continue
			}
			collected = append(collected, expectedCaptures[i])
		}
		return pattern.Line, nil
	}, &collected
}

// runDummyTextStream simulates a command's line-by-line log output: txt is split on "\n" and every
// line is offered to every parser, in order. A parser whose RegExp matches the line gets its
// MatchedResults and Line fields set and its ExecFunc invoked; a non-nil error returned by ExecFunc
// is reported as a test failure.
func runDummyTextStream(t *testing.T, txt string, parsers []*gofrogcmd.CmdOutputPattern) {
	for _, logLine := range strings.Split(txt, "\n") {
		for _, p := range parsers {
			// FindStringSubmatch returns nil exactly when the expression does not match the line.
			submatches := p.RegExp.FindStringSubmatch(logLine)
			if submatches == nil {
				continue
			}
			p.MatchedResults = submatches
			p.Line = logLine
			// Run the parser's handler on the matched line.
			_, execErr := p.ExecFunc(p)
			assert.NoError(t, execErr)
		}
	}
}
Loading