scraper/requests/requests.go

77 lines
1.6 KiB
Go
Raw Permalink Normal View History

2020-10-05 08:24:33 +00:00
package requests
import (
"fmt"
"log"
"net/http"
"net/url"
2020-10-19 09:26:23 +00:00
"github.com/PuerkitoBio/goquery"
"github.com/go-redis/redis"
2020-10-05 08:24:33 +00:00
"golang.org/x/net/proxy"
)
// Endpoints and fallback values used by the package-level clients.
const (
	// redisAddr is the address the Redis client connects to.
	redisAddr = "127.0.0.1:6379"

	// torAddr is the SOCKS5 proxy URL that outgoing HTTP connections are
	// dialed through.
	torAddr = "socks5://127.0.0.1:9050"

	// defaultAgent is the User-Agent sent when Redis does not supply one.
	// NOTE(review): the literal contains "…" where a real Firefox 68 agent
	// string has "; rv:68.0" — confirm this is intended and not a truncation.
	defaultAgent = "Mozilla/5.0 (X11; Linux x86_64…) Gecko/20100101 Firefox/68.0"
)
var cli *http.Client
2020-10-19 09:26:23 +00:00
var red *redis.Client
2020-10-05 08:24:33 +00:00
func init() {
proxyUrl, err := url.Parse(torAddr)
if err != nil {
log.Fatalf("error while parsing torAddr %s : %s", torAddr, err)
}
dialer, err := proxy.FromURL(proxyUrl, proxy.Direct)
if err != nil {
log.Fatalf("error while creating dialer : %s", err)
}
transport := &http.Transport{
Dial: dialer.Dial,
}
cli = &http.Client{
Transport: transport,
}
2020-10-19 09:26:23 +00:00
red = redis.NewClient(&redis.Options{
Addr: redisAddr,
})
if pong := red.Ping().Val(); pong != "PONG" {
log.Fatalf("unexpected response from redis PING conmmand : %s", pong)
}
2020-10-05 08:24:33 +00:00
}
func GetDocumentFromURL(url string) (*goquery.Document, error) {
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, fmt.Errorf("error while building request: %s", err)
}
2020-10-19 09:26:23 +00:00
agent := red.SRandMember("agents").Val()
if agent == "" {
agent = defaultAgent
}
req.Header.Set("User-Agent", agent)
2020-10-05 08:24:33 +00:00
resp, err := cli.Do(req)
if err != nil {
return nil, fmt.Errorf("error while sending request: %s", err)
}
defer func() {
if err := resp.Body.Close(); err != nil {
log.Fatalf("error while closing body for %s : %s", url, err)
}
}()
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return nil, fmt.Errorf("error while parsing response: %s", err)
}
return doc, nil
}