214 lines
4.9 KiB
Go
214 lines
4.9 KiB
Go
package sravni
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"strings"
|
|
|
|
"git.loyso.art/frx/kurious/internal/domain"
|
|
|
|
"github.com/go-resty/resty/v2"
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
)
|
|
|
|
const (
	// baseURL is the root of the sravni.ru courses section. NewClient passes
	// it to resty's SetBaseURL, so request paths are resolved against it.
	baseURL = "https://www.sravni.ru/kursy"
)
|
|
|
|
func NewClient(ctx context.Context, log *slog.Logger, debug bool) (c *client, err error) {
|
|
c = &client{
|
|
log: log.With(slog.String("client", "sravni")),
|
|
http: resty.New().
|
|
SetBaseURL(baseURL).
|
|
SetDebug(debug),
|
|
}
|
|
|
|
c.cachedMainPageInfo, err = c.getMainPageState(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return c, nil
|
|
}
|
|
|
|
type client struct {
|
|
log *slog.Logger
|
|
http *resty.Client
|
|
|
|
cachedMainPageInfo *PageState
|
|
}
|
|
|
|
// PageStateRuntimeConfig mirrors the "runtimeConfig" object of the page state
// JSON. Every field is a plain string taken from the JSON key named in its
// tag.
//
// NOTE(review): OrgnazationURL is a misspelling of "OrganizationURL". It is
// an exported field, so it is left as is to avoid breaking callers; it maps
// to the "organizationsUrl" key.
type PageStateRuntimeConfig struct {
	BrandingURL      string `json:"brandingUrl"`
	Release          string `json:"release"`
	Environment      string `json:"environment"`
	Gateway          string `json:"gatewayUrl"`
	APIGatewayURL    string `json:"apiGatewayUrl"`
	EducationURL     string `json:"educationUrl"`
	PhoneVerifierURL string `json:"phoneVerifierUrl"`
	WebPath          string `json:"webPath"`
	ServiceName      string `json:"serviceName"`
	OrgnazationURL   string `json:"organizationsUrl"`
}
|
|
|
|
// Link is a titled URL as it appears in the "links" array of a prefooter
// item in the page state JSON.
type Link struct {
	URL   string `json:"url"`
	Title string `json:"title"`
}
|
|
|
|
type ReduxStatePrefooterItem struct {
|
|
Title string `json:"title"`
|
|
Links []Link `json:"links"`
|
|
}
|
|
|
|
type ReduxMetadata struct {
|
|
Data struct {
|
|
Prefooter []ReduxStatePrefooterItem `json:"prefooter"`
|
|
} `json:"data"`
|
|
}
|
|
|
|
type InitialReduxState struct {
|
|
Metadata ReduxMetadata `json:"metadata"`
|
|
Categories struct {
|
|
Data map[string]int `json:"data"`
|
|
} `json:"categories"`
|
|
}
|
|
|
|
type PageStateProperties struct {
|
|
InitialReduxState InitialReduxState `json:"initialReduxState"`
|
|
}
|
|
|
|
type PageState struct {
|
|
Page string `json:"page"`
|
|
Query map[string]string `json:"query"`
|
|
BuildID string `json:"buildId"`
|
|
RuntimeConfig PageStateRuntimeConfig `json:"runtimeConfig"`
|
|
Props PageStateProperties `json:"props"`
|
|
}
|
|
|
|
func (p *PageState) Clone() *PageState {
|
|
copiedState := *p
|
|
copiedState.Query = make(map[string]string, len(p.Query))
|
|
for k, v := range p.Query {
|
|
copiedState.Query[k] = v
|
|
}
|
|
|
|
data := p.Props.InitialReduxState.Categories.Data
|
|
copiedData := make(map[string]int, len(data))
|
|
for k, v := range data {
|
|
copiedData[k] = v
|
|
}
|
|
copiedState.Props.InitialReduxState.Categories.Data = copiedData
|
|
|
|
return &copiedState
|
|
}
|
|
|
|
func (c *client) GetMainPageState() *PageState {
|
|
return c.cachedMainPageInfo.Clone()
|
|
}
|
|
|
|
func (c *client) getMainPageState(ctx context.Context) (*PageState, error) {
|
|
ctxLogger := restyCtxLogger{
|
|
ctx: ctx,
|
|
log: c.log,
|
|
}
|
|
|
|
req := c.http.R().
|
|
SetContext(ctx).
|
|
SetLogger(ctxLogger).
|
|
SetDoNotParseResponse(true).
|
|
EnableTrace()
|
|
|
|
resp, err := req.Get("/")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("getting request: %w", err)
|
|
}
|
|
|
|
if resp.IsError() {
|
|
c.log.ErrorContext(ctx, "unable to proceed request", slog.String("body", string(resp.Body())))
|
|
return nil, fmt.Errorf("got %d, but expected success: %w", resp.StatusCode(), domain.UnexpectedStatusError)
|
|
}
|
|
|
|
traceInfo := resp.Request.TraceInfo()
|
|
c.log.InfoContext(ctx, "request proceeded", slog.Any("trace", traceInfo))
|
|
|
|
return c.parsePageState(ctx, resp.RawBody())
|
|
}
|
|
|
|
func (c *client) parsePageState(ctx context.Context, body io.Reader) (*PageState, error) {
|
|
page, err := html.Parse(body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parsing body: %w", err)
|
|
}
|
|
|
|
c.log.DebugContext(ctx, "finding page state")
|
|
|
|
var (
|
|
foundHtml bool
|
|
foundBody bool
|
|
)
|
|
findFunc := func(node *html.Node) (found, deeper bool) {
|
|
if node == nil {
|
|
c.log.DebugContext(ctx, "node is null, skipping")
|
|
return false, false
|
|
}
|
|
|
|
if !foundHtml && node.Type == html.ElementNode {
|
|
c.log.DebugContext(ctx, "found html node")
|
|
foundHtml = true
|
|
return false, true
|
|
}
|
|
|
|
if foundHtml && !foundBody && node.DataAtom == atom.Body {
|
|
c.log.DebugContext(ctx, "found body node")
|
|
foundBody = true
|
|
return false, true
|
|
}
|
|
|
|
if foundHtml && foundBody && node.DataAtom == atom.Script {
|
|
for _, attr := range node.Attr {
|
|
if attr.Key == "id" && attr.Val == "__NEXT_DATA__" {
|
|
c.log.DebugContext(ctx, "found script node with next_data")
|
|
return true, false
|
|
}
|
|
}
|
|
}
|
|
|
|
return false, false
|
|
}
|
|
|
|
nextData := findNode(page, findFunc)
|
|
if nextData == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
var out PageState
|
|
dataReader := strings.NewReader(nextData.FirstChild.Data)
|
|
err = json.NewDecoder(dataReader).Decode(&out)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("decoding html data: %w", err)
|
|
}
|
|
return &out, nil
|
|
}
|
|
|
|
func findNode(parent *html.Node, eq func(*html.Node) (found, deeper bool)) *html.Node {
|
|
for child := parent.FirstChild; child != nil; child = child.NextSibling {
|
|
found, deeper := eq(child)
|
|
if found {
|
|
return child
|
|
}
|
|
if deeper {
|
|
deeperChild := findNode(child, eq)
|
|
if deeperChild != nil {
|
|
return deeperChild
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|