// Package news lists articles from the feeds of configured sources and
// enriches them by scraping the linked article pages.
package news

import (
	"fmt"
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/mmcdole/gofeed"

	"1bet.fr/scraper/requests"
	"1bet.fr/scraper/utils"
)

// htmlTagPattern matches anything shaped like an HTML tag. It is compiled
// once at package scope instead of once per feed item.
var htmlTagPattern = regexp.MustCompile(`<[^>]*>`)

// Sport describes the sport a Source belongs to.
type Sport struct {
	Id        int
	Name      string
	CleanName string // presumably the sanitized form of Name; verify against the code that fills it
}

// Source is a news feed attached to a Sport. FeedUrl is the URL that
// ListNews downloads and parses.
type Source struct {
	Id        int
	Sport     *Sport
	Name      string
	CleanName string
	FeedUrl   string
	Error     *string // not written by the code in this file
	Trace     *string // not written by the code in this file
}

// News is a single article. ListNews fills the fields available in the feed
// item; Feed completes them by scraping the article page.
type News struct {
	Id          int
	Source      *Source
	LeagueId    *int // not populated by the code in this file
	TeamId      *int // not populated by the code in this file
	Title       string
	CleanTitle  string // utils.Sanitize applied to Title
	Link        string
	PubDate     *time.Time
	Description *string // feed description with HTML tags stripped
	Image       *string
	Teaser      *string
	Author      *string
	Content     *[]string // one entry per scraped paragraph
	Redirect    *string   // set to Link when no content could be scraped
	Haystack    *string   // CleanTitle and CleanTags joined with "-"
	Tags        *[]string
	CleanTags   *[]string // utils.Sanitize applied to each entry of Tags
	Error       *string
	Trace       *string
}

// addTag trims raw and records it in Tags/CleanTags unless it is empty or its
// sanitized form is already present in CleanTags.
func (n *News) addTag(raw string) {
	tag := strings.TrimSpace(raw)
	if tag == "" {
		return
	}
	cleanTag := utils.Sanitize(tag)
	if utils.ArrayPointerContains(n.CleanTags, cleanTag) {
		return
	}
	n.Tags = utils.ArrayPointerAppend(n.Tags, tag)
	n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, cleanTag)
}

// appendTags records the text of every element of sel as a tag.
func (n *News) appendTags(sel *goquery.Selection) {
	sel.Each(func(_ int, s *goquery.Selection) {
		n.addTag(s.Text())
	})
}

// appendContent appends the text of every element of sel to Content, ignoring
// the first skip elements.
func (n *News) appendContent(sel *goquery.Selection, skip int) {
	sel.Each(func(i int, s *goquery.Selection) {
		if i < skip {
			return
		}
		n.Content = utils.ArrayPointerAppend(n.Content, s.Text())
	})
}

// Feed downloads the page behind n.Link and fills Teaser, Content, Author and
// the tags using selectors specific to the link's host.
//
// For an unrecognized host nothing is scraped and Error/Trace are set on n
// instead. When no content was scraped, Redirect is set to Link. When no tag
// is known at all, the source's sport is used as the only tag. Haystack is
// always rebuilt from CleanTitle and CleanTags.
//
// It returns an error when the link cannot be parsed, when the page cannot be
// fetched, or when the sport fallback is needed but n.Source or its Sport is
// nil.
func (n *News) Feed() error {
	parsedLink, err := url.Parse(n.Link)
	if err != nil {
		return err
	}
	doc, err := requests.GetDocumentFromURL(n.Link)
	if err != nil {
		return err
	}

	switch parsedLink.Host {
	case utils.HostEurosport:
		n.Teaser = utils.StringPointer(doc.Find("h2").First().Text())
		n.appendContent(doc.Find(".article-body .article-s4-rs p"), 0)
		n.Author = utils.StringPointer(doc.Find(".flex a.caption-s5-fx div.font-bold").Text())
		n.appendTags(doc.Find(".related-topics .atom-tag"))
	case utils.HostRugbyrama:
		n.Teaser = utils.StringPointer(doc.Find("h2.storyfull__teaser").Text())
		n.appendContent(doc.Find(".storyfull__paragraphs p.storyfull__paragraph"), 0)
		// The byline reads "Par <name>"; drop the first "Par ".
		author := doc.Find(".storyfull__publisher-author-name").Text()
		n.Author = utils.StringPointer(strings.Replace(author, "Par ", "", 1))
		n.appendTags(doc.Find(".storyfull__linkentities-infos a.storyfull__linkentities-name"))
	case utils.HostLequipe:
		n.Teaser = utils.StringPointer(doc.Find("h2.Article__chapo").Text())
		n.appendContent(doc.Find(".Paragraph__content"), 0)
		n.Author = utils.StringPointer(doc.Find(".Author__name").Text())
		n.appendTags(doc.Find(".RelatedLinks a.RelatedLinks__link"))
	case utils.HostFFTT:
		// The first paragraph serves as the teaser, the rest as content.
		paragraphs := doc.Find(".news-description p")
		n.Teaser = utils.StringPointer(paragraphs.First().Text())
		n.appendContent(paragraphs, 1)
		n.appendTags(doc.Find(".social-shares-large-wrapper a.link"))
	case utils.HostFootmercato:
		n.Teaser = utils.StringPointer(doc.Find("h2.article__lead").Text())
		n.appendContent(doc.Find(".article__content p"), 0)
		n.Author = utils.StringPointer(doc.Find(".article__author a").Text())
	default:
		n.Error = utils.StringPointer("unknown link host")
		n.Trace = utils.StringPointer("unknown link host : " + n.Link)
	}

	if n.Content == nil {
		n.Redirect = utils.StringPointer(n.Link)
	}
	if n.CleanTags == nil {
		// Fall back on the sport itself; report a missing sport as an error
		// rather than panicking on a nil dereference.
		if n.Source == nil || n.Source.Sport == nil {
			return fmt.Errorf("news %q has no tags and no source sport to fall back on", n.Link)
		}
		n.Tags = utils.ArrayPointerAppend(n.Tags, n.Source.Sport.Name)
		n.CleanTags = utils.ArrayPointerAppend(n.CleanTags, n.Source.Sport.CleanName)
	}
	n.Haystack = utils.StringPointer(fmt.Sprintf("%s-%s", n.CleanTitle, utils.ArrayPointerJoin(n.CleanTags, "-")))
	return nil
}

// ListNews downloads and parses the feed at s.FeedUrl and returns one News
// per feed item, with s as its Source.
//
// Each item's categories are split on "," and recorded as trimmed,
// de-duplicated tags. The image is taken from the item's image, else from its
// first enclosure, else from the first <img> found in the description. The
// returned slice is nil when the feed has no items. An error is returned only
// when the feed cannot be fetched or parsed.
func (s *Source) ListNews() ([]*News, error) {
	feed, err := gofeed.NewParser().ParseURL(s.FeedUrl)
	if err != nil {
		return nil, err
	}

	var newsList []*News
	for _, item := range feed.Items {
		n := &News{
			Source:      s,
			Title:       item.Title,
			CleanTitle:  utils.Sanitize(item.Title),
			Link:        item.Link,
			PubDate:     item.PublishedParsed,
			Description: utils.StringPointer(htmlTagPattern.ReplaceAllLiteralString(item.Description, "")),
		}

		for _, categories := range item.Categories {
			for _, tag := range strings.Split(categories, ",") {
				n.addTag(tag)
			}
		}

		switch {
		case item.Image != nil:
			n.Image = utils.StringPointer(item.Image.URL)
		case len(item.Enclosures) > 0:
			n.Image = utils.StringPointer(item.Enclosures[0].URL)
		default:
			// Best effort: a description that cannot be parsed as HTML
			// simply yields no image.
			if doc, err := goquery.NewDocumentFromReader(strings.NewReader(item.Description)); err == nil {
				if src, ok := doc.Find("img").Attr("src"); ok {
					n.Image = utils.StringPointer(src)
				}
			}
		}

		if item.Author != nil {
			n.Author = utils.StringPointer(item.Author.Name)
		}
		newsList = append(newsList, n)
	}
	return newsList, nil
}