Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1228 fix instagram download #1252

Merged
merged 4 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions extractors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ var (
ErrURLParseFailed = errors.New("url parse failed")
ErrInvalidRegularExpression = errors.New("invalid regular expression")
ErrURLQueryParamsParseFailed = errors.New("url query params parse failed")
ErrBodyParseFailed = errors.New("body parse failed")
)
214 changes: 124 additions & 90 deletions extractors/instagram/instagram.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@ package instagram
import (
"encoding/json"
netURL "net/url"
"path"
"strings"

"github.com/pkg/errors"
"golang.org/x/net/html"

"github.com/iawia002/lux/extractors"
"github.com/iawia002/lux/parser"
"github.com/iawia002/lux/request"
"github.com/iawia002/lux/utils"
)
Expand All @@ -18,18 +17,30 @@ func init() {
extractors.Register("instagram", New())
}

type instagram struct {
ShortcodeMedia struct {
EdgeSidecar struct {
Edges []struct {
Node struct {
DisplayURL string `json:"display_url"`
IsVideo bool `json:"is_video"`
VideoURL string `json:"video_url"`
} `json:"node"`
} `json:"edges"`
} `json:"edge_sidecar_to_children"`
} `json:"shortcode_media"`
// instagramPayload holds the fields decoded from the JSON document found in
// the page's <script type="application/ld+json"> element. Only the keys named
// in the struct tags are read; any other key in the document is ignored by
// encoding/json.
type instagramPayload struct {
	ArticleBody string `json:"articleBody"`
	// Author describes the account the document is attributed to.
	Author struct {
		Image           string `json:"image"`
		Name            string `json:"name"`
		AlternativeName string `json:"alternativeName"`
		Url             string `json:"url"`
	} `json:"author"`
	// Videos lists the video entries of the document; ContentURL is the
	// address that gets downloaded.
	Videos []struct {
		// UploadData is filled from the "uploadDate" key. The Go field name
		// is kept as it was so existing references keep compiling; the tag
		// used to be "string", which never matched any key.
		UploadData   string `json:"uploadDate"`
		Description  string `json:"description"`
		Name         string `json:"name"`
		Caption      string `json:"caption"`
		Height       string `json:"height"`
		Width        string `json:"width"`
		ContentURL   string `json:"contentUrl"`
		ThumbnailURL string `json:"thumbnailUrl"`
	} `json:"video"`
	// Images lists the image entries of the document; URL is the address
	// that gets downloaded.
	Images []struct {
		Caption string `json:"caption"`
		Height  string `json:"height"`
		Width   string `json:"width"`
		URL     string `json:"url"`
	} `json:"image"`
}

type extractor struct{}
Expand All @@ -39,104 +50,65 @@ func New() extractors.Extractor {
return &extractor{}
}

func extractImageFromPage(html, url string) (map[string]*extractors.Stream, error) {
_, realURLs, err := parser.GetImages(html, "EmbeddedMediaImage", nil)
// Extract is the main function to extract the data.
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
u, err := netURL.Parse(url)
if err != nil {
return nil, errors.WithStack(err)
}

urls := make([]*extractors.Part, 0, len(realURLs))
var totalSize int64
for _, realURL := range realURLs {
size, err := request.Size(realURL, url)
if err != nil {
return nil, errors.WithStack(err)
}
urlData := &extractors.Part{
URL: realURL,
Size: size,
Ext: "jpg",
}
urls = append(urls, urlData)
totalSize += size
htmlResp, err := request.Get(u.String(), url, nil)
if err != nil {
return nil, errors.WithStack(err)
}

return map[string]*extractors.Stream{
"default": {
Parts: urls,
Size: totalSize,
},
}, nil
}
reader := strings.NewReader(htmlResp)
htmlRoot, err := html.Parse(reader)
if err != nil {
return nil, errors.WithStack(err)
}

func extractFromData(dataString, url string) (map[string]*extractors.Stream, error) {
var data instagram
if err := json.Unmarshal([]byte(dataString), &data); err != nil {
sNode, err := dfsFindScript(htmlRoot)
if err != nil {
return nil, errors.WithStack(err)
}

var payload instagramPayload
if err = json.Unmarshal([]byte(sNode.Data), &payload); err != nil {
return nil, errors.WithStack(err)
}

urls := make([]*extractors.Part, 0, len(data.ShortcodeMedia.EdgeSidecar.Edges))
var totalSize int64
for _, u := range data.ShortcodeMedia.EdgeSidecar.Edges {
// Image
realURL := u.Node.DisplayURL
ext := "jpg"
if u.Node.IsVideo {
// Video
realURL = u.Node.VideoURL
ext = "mp4"
var parts []*extractors.Part
if len(payload.Videos) > 0 {
videoParts, err := createPartVideos(&payload, url)
if err != nil {
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}

size, err := request.Size(realURL, url)
parts = append(parts, videoParts...)
}
if len(payload.Images) > 0 {
imageParts, err := createPartImages(&payload, url)
if err != nil {
return nil, errors.WithStack(err)
}
urlData := &extractors.Part{
URL: realURL,
Size: size,
Ext: ext,
return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}
urls = append(urls, urlData)
totalSize += size

parts = append(parts, imageParts...)
}

for _, part := range parts {
totalSize += part.Size
}

return map[string]*extractors.Stream{
streams := map[string]*extractors.Stream{
"default": {
Parts: urls,
Parts: parts,
Size: totalSize,
},
}, nil
}

// Extract is the main function to extract the data.
func (e *extractor) Extract(url string, option extractors.Options) ([]*extractors.Data, error) {
// Instagram is forcing a login to access the page, so we use the embed page to bypass that.
u, err := netURL.Parse(url)
if err != nil {
return nil, errors.WithStack(err)
}
id := u.Path[strings.LastIndex(u.Path, "/")+1:]
u.Path = path.Join(u.Path, "embed")

html, err := request.Get(u.String(), url, nil)
if err != nil {
return nil, errors.WithStack(err)
}
dataStrings := utils.MatchOneOf(html, `window\.__additionalDataLoaded\('graphql',(.*)\);`)
if dataStrings == nil || len(dataStrings) < 2 {
return nil, errors.WithStack(extractors.ErrURLParseFailed)
}
dataString := dataStrings[1]

var streams map[string]*extractors.Stream
if dataString == "" || dataString == "null" {
streams, err = extractImageFromPage(html, url)
} else {
streams, err = extractFromData(dataString, url)
}
if err != nil {
return nil, errors.WithStack(err)
}
id := u.Path[strings.LastIndex(u.Path, "/")+1:]

return []*extractors.Data{
{
Expand All @@ -148,3 +120,65 @@ func (e *extractor) Extract(url string, option extractors.Options) ([]*extractor
},
}, nil
}

// dfsFindScript searches the tree rooted at n depth-first, in document order,
// for a <script type="application/ld+json"> element and returns that
// element's first child node (the node whose Data the caller feeds to the
// JSON decoder).
//
// A matching script element without any child is skipped and the search
// continues, so a nil error always comes with a non-nil node. When no usable
// script is found, or n is nil, the result is a nil node and
// extractors.ErrBodyParseFailed wrapped with a stack trace.
func dfsFindScript(n *html.Node) (*html.Node, error) {
	if found := findLDJSONContent(n); found != nil {
		return found, nil
	}

	// The error is built once here rather than at every non-matching node
	// visited during the recursion.
	return nil, errors.WithStack(extractors.ErrBodyParseFailed)
}

// findLDJSONContent is the recursive worker behind dfsFindScript. It returns
// the first child of the first non-empty ld+json script element under n, or
// nil when there is none.
func findLDJSONContent(n *html.Node) *html.Node {
	if n == nil {
		return nil
	}

	if n.Type == html.ElementNode && n.Data == "script" && n.FirstChild != nil {
		for _, attr := range n.Attr {
			if attr.Key == "type" && attr.Val == "application/ld+json" {
				return n.FirstChild
			}
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		if found := findLDJSONContent(c); found != nil {
			return found
		}
	}

	return nil
}

// createPartVideos builds one extractors.Part per entry of payload.Videos,
// in the same order as the entries.
//
// For each entry the extension is taken from utils.GetNameAndExt on the
// entry's ContentURL and the size from request.Size(ContentURL, ref). ref is
// handed to request.Size unchanged (presumably used as the referer of the
// size request; confirm against the request package).
//
// The first failure aborts the loop: the error is returned wrapped with a
// stack trace and the returned slice is nil, so callers never see a
// partially built result.
func createPartVideos(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) {
	parts = make([]*extractors.Part, 0, len(payload.Videos))

	for i := range payload.Videos {
		video := &payload.Videos[i]

		// Distinct local error names keep the named result err from being
		// shadowed inside the loop.
		_, ext, extErr := utils.GetNameAndExt(video.ContentURL)
		if extErr != nil {
			return nil, errors.WithStack(extErr)
		}

		size, sizeErr := request.Size(video.ContentURL, ref)
		if sizeErr != nil {
			return nil, errors.WithStack(sizeErr)
		}

		parts = append(parts, &extractors.Part{
			URL:  video.ContentURL,
			Size: size,
			Ext:  ext,
		})
	}

	return parts, nil
}

// createPartImages builds one extractors.Part per entry of payload.Images,
// in the same order as the entries.
//
// For each entry the extension is taken from utils.GetNameAndExt on the
// entry's URL and the size from request.Size(URL, ref). ref is handed to
// request.Size unchanged (presumably used as the referer of the size
// request; confirm against the request package).
//
// The first failure aborts the loop: the error is returned wrapped with a
// stack trace and the returned slice is nil, so callers never see a
// partially built result.
func createPartImages(payload *instagramPayload, ref string) (parts []*extractors.Part, err error) {
	parts = make([]*extractors.Part, 0, len(payload.Images))

	for i := range payload.Images {
		image := &payload.Images[i]

		// Distinct local error names keep the named result err from being
		// shadowed inside the loop.
		_, ext, extErr := utils.GetNameAndExt(image.URL)
		if extErr != nil {
			return nil, errors.WithStack(extErr)
		}

		size, sizeErr := request.Size(image.URL, ref)
		if sizeErr != nil {
			return nil, errors.WithStack(sizeErr)
		}

		parts = append(parts, &extractors.Part{
			URL:  image.URL,
			Size: size,
			Ext:  ext,
		})
	}

	return parts, nil
}
6 changes: 3 additions & 3 deletions extractors/instagram/instagram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,23 @@ func TestDownload(t *testing.T) {
args: test.Args{
URL: "https://www.instagram.com/p/BlIka1ZFCNr",
Title: "Instagram BlIka1ZFCNr",
Size: 3003662,
Size: 577298,
},
},
{
name: "image test",
args: test.Args{
URL: "https://www.instagram.com/p/Bl5oVUyl9Yx",
Title: "Instagram Bl5oVUyl9Yx",
Size: 250596,
Size: 101611,
},
},
{
name: "image album test",
args: test.Args{
URL: "https://www.instagram.com/p/Bjyr-gxF4Rb",
Title: "Instagram Bjyr-gxF4Rb",
Size: 4599909,
Size: 241466,
},
},
}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/pkg/errors v0.9.1
github.com/robertkrimen/otto v0.0.0-20211024170158-b87d35c0b86f
github.com/urfave/cli/v2 v2.6.0
golang.org/x/net v0.7.0
)

require (
Expand All @@ -37,7 +38,6 @@ require (
github.com/rogpeppe/go-internal v1.9.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
golang.org/x/exp v0.0.0-20220518171630-0b5c67f07fdf // indirect
golang.org/x/net v0.7.0 // indirect
golang.org/x/sys v0.5.0 // indirect
golang.org/x/text v0.7.0 // indirect
gopkg.in/sourcemap.v1 v1.0.5 // indirect
Expand Down