基本用法
infos := make([]info, 0)
c := colly.NewCollector()
c.OnHTML(".content div .rprt", func(e *colly.HTMLElement) {
title := e.ChildText(".rslt p.title a")
author := e.ChildText(".supp .desc")
message := e.ChildText(".supp .details")
pmid := e.ChildText(".aux .resc .rprtid dd")
id, _ := strconv.ParseInt(pmid, 10, 64)
i := info{
title: strings.TrimRight(title, "."),
author: author,
message: message,
pmid: id,
}
infos = append(infos, i)
})
c.OnRequest(func(req *colly.Request) {
log.Println("Visit:", req.URL.String())
})
c.Post("https://www.ncbi.nlm.nih.gov/pubmed", map[string]string{
"term": "kif15",
"p$a": strconv.Itoa(3),
"p$l": "EntrezSystem2",
"p$st": "pubmed",
"EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.CurrPage": strconv.Itoa(3),
})
log.Println(len(infos))
for _, inf := range infos {
log.Printf("%+v", inf)
}
代码分析
架构逻辑
collector结构体
// Collector 为收集html工作提供收集实例
type Collector struct {
// UserAgent 是HTTP请求使用的User-Agent字符串
UserAgent string
// MaxDepth 限制了访问过的URL的递归深度。
// 将其设置为0以进行无限递归(默认)。
MaxDepth int
// AllowedDomains 是一个域名白名单。
// 将其留空以允许访问任何域
AllowedDomains []string
// DisallowedDomains 是一个域名黑名单。
DisallowedDomains []string
// DisallowedURLFilters是一个限制的正则表达式列表
// 访问网址如果任何规则与URL匹配
// 请求将被停止。 DisallowedURLFilters会
// 在URLFilters之前进行评估
// 将其留空以允许访问任何URL
DisallowedURLFilters []*regexp.Regexp
// URLFilters 是一个限制访问网站的正则表达式列表。
// 如果网站和任意一个规则所匹配,访问将不会被停止
// DisallowedURLFilters 将会在 URLFilters 之前被评估
// 使其为空将允许所有网址访问
URLFilters []*regexp.Regexp
// AllowURLRevisit 允许多次下载同一个网址
AllowURLRevisit bool
// MaxBodySize 是检索字节响应体的限制。
// 0意味着不做限制。
// MaxBodySize的默认值是10兆 (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file. See http://www.robotstxt.org/ for more
// information.
IgnoreRobotsTxt bool
// Async turns on asynchronous network communication. Use Collector.Wait() to
// be sure all requests have been finished.
Async bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
// to true to enable it.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// RedirectHandler allows control on how a redirect will be managed
RedirectHandler func(req *http.Request, via []*http.Request) error
store storage.Storage
debugger debug.Debugger
robotsMap map[string]*robotstxt.RobotsData
htmlCallbacks []*htmlCallbackContainer
xmlCallbacks []*xmlCallbackContainer
requestCallbacks []RequestCallback
responseCallbacks []ResponseCallback
errorCallbacks []ErrorCallback
scrapedCallbacks []ScrapedCallback
requestCount uint32
responseCount uint32
backend *httpBackend
wg *sync.WaitGroup
lock *sync.RWMutex
}
实例创建后,可以通过实例,对目标网址进行访问
c := colly.NewCollector()
// visit 访问,类型为get
c.Visit("url")
// post 访问,可带post请求参数params [string]string
c.Post("url",params)
访问调用的是golang自带的http包,通过请求,返回http响应,将http的响应body全部读取,得到的字节流封装成colly.response返回
将字节流进行解析,得到标签+属性列表的节点Node
//伪代码
Node{
selector string // 标签
attrs [string]string // 属性列表
}
解析后返回的节点分析 Parent 为父节点,唯一的不会改变 FirstChild 第一个子节点,也是唯一的 LastChild 在每次封装当前节点的时候会改变 PrevSibling 当前结点的上个节点 NextSibling 当前节点的下个节点 NodeType 节点类型 Namespace 标签 Attr 属性
type Node struct {
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
Type NodeType
DataAtom atom.Atom
Data string
Namespace string
Attr []Attribute
}
具体封装,通过parser进行解析, Tokenizer 标签生成器,提供标签,字节流通过这个读取 doc 用来封装节点信息
type parser struct {
// tokenizer provides the tokens for the parser.
tokenizer *Tokenizer
// tok is the most recently read token.
tok Token
// Self-closing tags like <hr/> are treated as start tags, except that
// hasSelfClosingToken is set while they are being processed.
hasSelfClosingToken bool
// doc is the document root element.
doc *Node
// The stack of open elements (section 12.2.4.2) and active formatting
// elements (section 12.2.4.3).
oe, afe nodeStack
// Element pointers (section 12.2.4.4).
head, form *Node
// Other parsing state flags (section 12.2.4.5).
scripting, framesetOK bool
// The stack of template insertion modes
templateStack insertionModeStack
// im is the current insertion mode.
im insertionMode
// originalIM is the insertion mode to go back to after completing a text
// or inTableText insertion mode.
originalIM insertionMode
// fosterParenting is whether new elements should be inserted according to
// the foster parenting rules (section 12.2.6.1).
fosterParenting bool
// quirks is whether the parser is operating in "quirks mode."
quirks bool
// fragment is whether the parser is parsing an HTML fragment.
fragment bool
// context is the context element when parsing an HTML fragment
// (section 12.4).
context *Node
}