diff --git a/extractors/errors.go b/extractors/errors.go index 9b57c107c..8759033bb 100644 --- a/extractors/errors.go +++ b/extractors/errors.go @@ -9,4 +9,5 @@ var ( ErrURLParseFailed = errors.New("url parse failed") ErrInvalidRegularExpression = errors.New("invalid regular expression") ErrURLQueryParamsParseFailed = errors.New("url query params parse failed") + ErrBodyParseFailed = errors.New("body parse failed") ) diff --git a/extractors/instagram/instagram.go b/extractors/instagram/instagram.go index 7c7f2902a..3477726de 100644 --- a/extractors/instagram/instagram.go +++ b/extractors/instagram/instagram.go @@ -3,13 +3,12 @@ package instagram import ( "encoding/json" netURL "net/url" - "path" "strings" "github.com/pkg/errors" + "golang.org/x/net/html" "github.com/iawia002/lux/extractors" - "github.com/iawia002/lux/parser" "github.com/iawia002/lux/request" "github.com/iawia002/lux/utils" ) @@ -18,18 +17,30 @@ func init() { extractors.Register("instagram", New()) } -type instagram struct { - ShortcodeMedia struct { - EdgeSidecar struct { - Edges []struct { - Node struct { - DisplayURL string `json:"display_url"` - IsVideo bool `json:"is_video"` - VideoURL string `json:"video_url"` - } `json:"node"` - } `json:"edges"` - } `json:"edge_sidecar_to_children"` - } `json:"shortcode_media"` +type instagramPayload struct { + ArticleBody string `json:"articleBody"` + Author struct { + Image string `json:"image"` + Name string `json:"name"` + AlternativeName string `json:"alternativeName"` + Url string `json:"url"` + } `json:"author"` + Videos []struct { + UploadData string `json:"string"` + Description string `json:"description"` + Name string `json:"name"` + Caption string `json:"caption"` + Height string `json:"height"` + Width string `json:"width"` + ContentURL string `json:"contentUrl"` + ThumbnailURL string `json:"thumbnailUrl"` + } `json:"video"` + Images []struct { + Caption string `json:"caption"` + Height string `json:"height"` + Width string `json:"width"` + URL string `json:"url"` + } `json:"image"` } type extractor struct{} @@ -39,104 +50,65 @@ func New() extractors.Extractor { return &extractor{} } -func extractImageFromPage(html, url string) (map[string]*extractors.Stream, error) { - _, realURLs, err := parser.GetImages(html, "EmbeddedMediaImage", nil) +// Extract is the main function to extract the data. +func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) { + u, err := netURL.Parse(url) if err != nil { return nil, errors.WithStack(err) } - urls := make([]*extractors.Part, 0, len(realURLs)) - var totalSize int64 - for _, realURL := range realURLs { - size, err := request.Size(realURL, url) - if err != nil { - return nil, errors.WithStack(err) - } - urlData := &extractors.Part{ - URL: realURL, - Size: size, - Ext: "jpg", - } - urls = append(urls, urlData) - totalSize += size + htmlResp, err := request.Get(u.String(), url, nil) + if err != nil { + return nil, errors.WithStack(err) } - return map[string]*extractors.Stream{ - "default": { - Parts: urls, - Size: totalSize, - }, - }, nil -} + reader := strings.NewReader(htmlResp) + htmlRoot, err := html.Parse(reader) + if err != nil { + return nil, errors.WithStack(err) + } -func extractFromData(dataString, url string) (map[string]*extractors.Stream, error) { - var data instagram - if err := json.Unmarshal([]byte(dataString), &data); err != nil { + sNode, err := dfsFindScript(htmlRoot) + if err != nil { + return nil, errors.WithStack(err) + } + + var payload instagramPayload + if err = json.Unmarshal([]byte(sNode.Data), &payload); err != nil { return nil, errors.WithStack(err) } - urls := make([]*extractors.Part, 0, len(data.ShortcodeMedia.EdgeSidecar.Edges)) var totalSize int64 - for _, u := range data.ShortcodeMedia.EdgeSidecar.Edges { - // Image - realURL := u.Node.DisplayURL - ext := "jpg" - if u.Node.IsVideo { - // Video - realURL = u.Node.VideoURL - ext = "mp4" + var parts []*extractors.Part + if len(payload.Videos) > 0 { + videoParts, err := createPartVideos(&payload, url) + if err != nil { + return nil, errors.WithStack(extractors.ErrBodyParseFailed) } - size, err := request.Size(realURL, url) + parts = append(parts, videoParts...) + } + if len(payload.Images) > 0 { + imageParts, err := createPartImages(&payload, url) if err != nil { - return nil, errors.WithStack(err) - } - urlData := &extractors.Part{ - URL: realURL, - Size: size, - Ext: ext, + return nil, errors.WithStack(extractors.ErrBodyParseFailed) } - urls = append(urls, urlData) - totalSize += size + + parts = append(parts, imageParts...) + } + + for _, part := range parts { + totalSize += part.Size } - return map[string]*extractors.Stream{ + streams := map[string]*extractors.Stream{ "default": { - Parts: urls, + Parts: parts, Size: totalSize, }, - }, nil -} - -// Extract is the main function to extract the data. -func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) { - // Instagram is forcing a login to access the page, so we use the embed page to bypass that. - u, err := netURL.Parse(url) - if err != nil { - return nil, errors.WithStack(err) } - id := u.Path[strings.LastIndex(u.Path, "/")+1:] - u.Path = path.Join(u.Path, "embed") - html, err := request.Get(u.String(), url, nil) - if err != nil { - return nil, errors.WithStack(err) - } - dataStrings := utils.MatchOneOf(html, `window\.__additionalDataLoaded\('graphql',(.*)\);`) - if dataStrings == nil || len(dataStrings) < 2 { - return nil, errors.WithStack(extractors.ErrURLParseFailed) - } - dataString := dataStrings[1] - - var streams map[string]*extractors.Stream - if dataString == "" || dataString == "null" { - streams, err = extractImageFromPage(html, url) - } else { - streams, err = extractFromData(dataString, url) - } - if err != nil { - return nil, errors.WithStack(err) - } + id := u.Path[strings.LastIndex(u.Path, "/")+1:] return []*extractors.Data{ { @@ -148,3 +120,65 @@ func (e *extractor) Extract(url string, option extractors.Options) ([]*extractor }, }, nil } + +func dfsFindScript(n *html.Node) (*html.Node, error) { + if n.Type == html.ElementNode && n.Data == "script" { + for _, attr := range n.Attr { + if attr.Key == "type" && attr.Val == "application/ld+json" { + return n.FirstChild, nil + } + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + if ret, err := dfsFindScript(c); err == nil { + return ret, nil + } + } + + return nil, errors.WithStack(extractors.ErrBodyParseFailed) +} + +func createPartVideos(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) { + for _, it := range payload.Videos { + _, ext, err := utils.GetNameAndExt(it.ContentURL) + if err != nil { + return parts, errors.WithStack(err) + } + filesize, err := request.Size(it.ContentURL, ref) + if err != nil { + return parts, errors.WithStack(err) + } + + part := &extractors.Part{ + URL: it.ContentURL, + Size: filesize, + Ext: ext, + } + parts = append(parts, part) + } + + return parts, err +} + +func createPartImages(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) { + for _, it := range payload.Images { + _, ext, err := utils.GetNameAndExt(it.URL) + if err != nil { + return parts, errors.WithStack(err) + } + filesize, err := request.Size(it.URL, ref) + if err != nil { + return parts, errors.WithStack(err) + } + + part := &extractors.Part{ + URL: it.URL, + Size: filesize, + Ext: ext, + } + parts = append(parts, part) + } + + return parts, err +} diff --git a/extractors/instagram/instagram_test.go b/extractors/instagram/instagram_test.go index 0bd4b9bde..7105ed068 100644 --- a/extractors/instagram/instagram_test.go +++ b/extractors/instagram/instagram_test.go @@ -17,7 +17,7 @@ func TestDownload(t *testing.T) { args: test.Args{ URL: "https://www.instagram.com/p/BlIka1ZFCNr", Title: "Instagram BlIka1ZFCNr", - Size: 3003662, + Size: 577298, }, }, { @@ -25,7 +25,7 @@ func TestDownload(t *testing.T) { args: test.Args{ URL: "https://www.instagram.com/p/Bl5oVUyl9Yx", Title: "Instagram Bl5oVUyl9Yx", - Size: 250596, + Size: 101611, }, }, { @@ -33,7 +33,7 @@ func TestDownload(t *testing.T) { args: test.Args{ URL: "https://www.instagram.com/p/Bjyr-gxF4Rb", Title: "Instagram Bjyr-gxF4Rb", - Size: 4599909, + Size: 241466, }, }, } diff --git a/go.mod b/go.mod index 971aa3d61..f28d0b6c6 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( github.com/pkg/errors v0.9.1 github.com/robertkrimen/otto v0.0.0-20211024170158-b87d35c0b86f github.com/urfave/cli/v2 v2.6.0 + golang.org/x/net v0.7.0 ) require ( @@ -37,7 +38,6 @@ require ( github.com/rogpeppe/go-internal v1.9.0 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect golang.org/x/exp v0.0.0-20220518171630-0b5c67f07fdf // indirect - golang.org/x/net v0.7.0 // indirect golang.org/x/sys v0.5.0 // indirect golang.org/x/text v0.7.0 // indirect gopkg.in/sourcemap.v1 v1.0.5 // indirect