colly源码包分析


基本用法

    infos := make([]info, 0)
    c := colly.NewCollector()
    c.OnHTML(".content div .rprt", func(e *colly.HTMLElement) {
        title := e.ChildText(".rslt p.title a")
        author := e.ChildText(".supp .desc")
        message := e.ChildText(".supp .details")
        pmid := e.ChildText(".aux .resc .rprtid dd")
        id, _ := strconv.ParseInt(pmid, 10, 64)
        i := info{
            title:   strings.TrimRight(title, "."),
            author:  author,
            message: message,
            pmid:    id,
        }
        infos = append(infos, i)
    })
    c.OnRequest(func(req *colly.Request) {
        log.Println("Visit:", req.URL.String())
    })
    c.Post("https://www.ncbi.nlm.nih.gov/pubmed", map[string]string{
        "term": "kif15",
        "p$a":  strconv.Itoa(3),
        "p$l":  "EntrezSystem2",
        "p$st": "pubmed",
        "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.CurrPage": strconv.Itoa(3),
    })
    log.Println(len(infos))
    for _, inf := range infos {
        log.Printf("%+v", inf)
    }

代码分析

架构逻辑

collector结构体

// Collector 为收集html工作提供收集实例
type Collector struct {
    // UserAgent 是HTTP请求使用的User-Agent字符串
    UserAgent string
    // MaxDepth 限制了访问过的URL的递归深度。
    // 将其设置为0以进行无限递归(默认)。
    MaxDepth int
    // AllowedDomains 是一个域名白名单。
    // 将其留空以允许访问任何域
    AllowedDomains []string
    // DisallowedDomains 是一个域名黑名单。
    DisallowedDomains []string
    // DisallowedURLFilters是一个限制的正则表达式列表
    // 访问网址如果任何规则与URL匹配
    // 请求将被停止。 DisallowedURLFilters会
    // 在URLFilters之前进行评估
    // 将其留空以允许访问任何URL
    DisallowedURLFilters []*regexp.Regexp
    // URLFilters 是一个限制访问网站的正则表达式列表。
    // 如果网站和任意一个规则所匹配,访问将不会被停止
    // DisallowedURLFilters 将会在 URLFilters 之前被评估
    // 使其为空将允许所有网址访问
    URLFilters []*regexp.Regexp

    // AllowURLRevisit 允许多次下载同一个网址
    AllowURLRevisit bool
    // MaxBodySize 是检索字节响应体的限制。
    // 0意味着不做限制。
    // MaxBodySize的默认值是10兆 (10 * 1024 * 1024 bytes).
    MaxBodySize int
    // CacheDir specifies a location where GET requests are cached as files.
    // When it's not defined, caching is disabled.
    CacheDir string
    // IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
    // the target host's robots.txt file.  See http://www.robotstxt.org/ for more
    // information.
    IgnoreRobotsTxt bool
    // Async turns on asynchronous network communication. Use Collector.Wait() to
    // be sure all requests have been finished.
    Async bool
    // ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
    // By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
    // to true to enable it.
    ParseHTTPErrorResponse bool
    // ID is the unique identifier of a collector
    ID uint32
    // DetectCharset can enable character encoding detection for non-utf8 response bodies
    // without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
    DetectCharset bool
    // RedirectHandler allows control on how a redirect will be managed
    RedirectHandler   func(req *http.Request, via []*http.Request) error
    store             storage.Storage
    debugger          debug.Debugger
    robotsMap         map[string]*robotstxt.RobotsData
    htmlCallbacks     []*htmlCallbackContainer
    xmlCallbacks      []*xmlCallbackContainer
    requestCallbacks  []RequestCallback
    responseCallbacks []ResponseCallback
    errorCallbacks    []ErrorCallback
    scrapedCallbacks  []ScrapedCallback
    requestCount      uint32
    responseCount     uint32
    backend           *httpBackend
    wg                *sync.WaitGroup
    lock              *sync.RWMutex
}

实例创建后,可以通过实例,对目标网址进行访问

c := colly.NewCollector()
// visit 访问,类型为get
c.Visit("url")
// post 访问,可带post请求参数params [string]string
c.Post("url",params)

访问调用的是golang自带的http包,通过请求,返回http响应,将http的响应body全部读取,得到的字节流封装成colly.response返回

将字节流进行解析,得到标签+属性列表的节点Node

//伪代码
Node{
    selector string // 标签
    attrs [string]string // 属性列表
}

解析后返回的节点分析 Parent 为父节点,唯一的不会改变 FirstChild 第一个子节点,也是唯一的 LastChild 在每次封装当前节点的时候会改变 PrevSibling 当前结点的上个节点 NextSibling 当前节点的下个节点 NodeType 节点类型 Namespace 标签 Attr 属性

type Node struct {
    Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node

    Type      NodeType
    DataAtom  atom.Atom
    Data      string
    Namespace string
    Attr      []Attribute
}

具体封装,通过parser进行解析, Tokenizer 标签生成器,提供标签,字节流通过这个读取 doc 用来封装节点信息

type parser struct {
    // tokenizer provides the tokens for the parser.
    tokenizer *Tokenizer
    // tok is the most recently read token.
    tok Token
    // Self-closing tags like <hr/> are treated as start tags, except that
    // hasSelfClosingToken is set while they are being processed.
    hasSelfClosingToken bool
    // doc is the document root element.
    doc *Node
    // The stack of open elements (section 12.2.4.2) and active formatting
    // elements (section 12.2.4.3).
    oe, afe nodeStack
    // Element pointers (section 12.2.4.4).
    head, form *Node
    // Other parsing state flags (section 12.2.4.5).
    scripting, framesetOK bool
    // The stack of template insertion modes
    templateStack insertionModeStack
    // im is the current insertion mode.
    im insertionMode
    // originalIM is the insertion mode to go back to after completing a text
    // or inTableText insertion mode.
    originalIM insertionMode
    // fosterParenting is whether new elements should be inserted according to
    // the foster parenting rules (section 12.2.6.1).
    fosterParenting bool
    // quirks is whether the parser is operating in "quirks mode."
    quirks bool
    // fragment is whether the parser is parsing an HTML fragment.
    fragment bool
    // context is the context element when parsing an HTML fragment
    // (section 12.4).
    context *Node
}

See also