Class: Spidr::Agent

Inherits:
Object
  • Object
show all
Includes:
Spidr::Actions, Spidr::Events, Spidr::Filters, Spidr::Sanitizers
Defined in:
lib/spidr/agent.rb

Instance Attribute Summary

Class Method Summary

Instance Method Summary

Methods included from Spidr::Sanitizers

included, #sanitize_url

Methods included from Spidr::Filters

#ignore_exts, #ignore_exts_like, #ignore_hosts, #ignore_hosts_like, #ignore_links, #ignore_links_like, #ignore_ports, #ignore_ports_like, included, #schemes=, #visit_ext?, #visit_exts, #visit_exts_like, #visit_host?, #visit_hosts, #visit_hosts_like, #visit_link?, #visit_links, #visit_links_like, #visit_port?, #visit_ports, #visit_ports_like, #visit_scheme?

Methods included from Spidr::Events

#all_headers, #every_atom_doc, #every_atom_page, #every_bad_request_page, #every_css_page, #every_doc, #every_failed_url, #every_forbidden_page, #every_html_doc, #every_html_page, #every_internal_server_error_page, #every_javascript_page, #every_missing_page, #every_ms_word_page, #every_ok_page, #every_page, #every_pdf_page, #every_redirect_page, #every_rss_doc, #every_rss_page, #every_timedout_page, #every_txt_page, #every_unauthorized_page, #every_url, #every_xml_doc, #every_xml_page, #every_xsl_doc, #every_xsl_page, #every_zip_page, #urls_like

Methods included from Spidr::Actions

#continue!, #pause!, #pause=, #paused?, #skip_link!, #skip_page!

Constructor Details

- (Agent) initialize(options = {}, &block) {|agent| ... }

Creates a new Agent object.

Parameters:

  • (Hash) options (defaults to: {}) — Additional options

Options Hash (options):

  • (Hash) :proxy — default: Spidr.proxy — The proxy information to use.
  • (String) :user_agent — default: Spidr.user_agent — The User-Agent string to send with each request.
  • (String) :referer N/A — The Referer URL to send with each request.
  • (Integer) :delay — default: 0 — The number of seconds to pause between each request.
  • (Set, Array) :queue N/A — The initial queue of URLs to visit.
  • (Set, Array) :history N/A — The initial list of visited URLs.

Yields:

  • (agent) — If a block is given, it will be passed the newly created agent for further configuration.

Yield Parameters:

  • (Agent) agent — The newly created agent.


89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/spidr/agent.rb', line 89

#
# Creates a new Agent object.
#
# @param [Hash] options
#   Additional options.
#
# @option options [Hash] :proxy (Spidr.proxy)
#   The proxy information to use.
#
# @option options [String] :user_agent (Spidr.user_agent)
#   The User-Agent string to send with each request.
#
# @option options [String] :referer
#   The Referer URL to send with each request.
#
# @option options [Integer] :delay (0)
#   The number of seconds to pause between each request.
#
# @option options [Set, Array] :queue
#   The initial queue of URLs to visit.
#
# @option options [Set, Array] :history
#   The initial list of visited URLs.
#
# @yield [agent]
#   If a block is given, it will be passed the newly created agent
#   for further configuration.
#
def initialize(options={},&block)
  @user_agent = (options[:user_agent] || Spidr.user_agent)
  @referer = options[:referer]

  @sessions = SessionCache.new(options[:proxy] || Spidr.proxy)
  @cookies = CookieJar.new
  @authorized = AuthStore.new

  @running = false
  @delay = (options[:delay] || 0)
  @history = Set[]
  @failures = Set[]
  @queue = []

  # Honor the documented :queue and :history options; the setters
  # convert plain Strings into URI objects.
  self.queue = options[:queue] if options[:queue]
  self.history = options[:history] if options[:history]

  super(options)

  block.call(self) if block
end

Instance Attribute Details

- (Object) authorized

HTTP Authentication credentials



26
27
28
# File 'lib/spidr/agent.rb', line 26

# HTTP Authentication credentials held by the agent.
#
# @return [Object] whatever is currently stored in @authorized.
def authorized
  return @authorized
end

- (Object) cookies (readonly)

Cached cookies



44
45
46
# File 'lib/spidr/agent.rb', line 44

# Cookies cached by the agent.
#
# @return [Object] whatever is currently stored in @cookies.
def cookies
  return @cookies
end

- (Object) delay

Delay in between fetching pages



32
33
34
# File 'lib/spidr/agent.rb', line 32

# Delay in between fetching pages.
#
# @return [Object] the value currently stored in @delay.
def delay
  return @delay
end

- (Object) failures (readonly)

List of unreachable URLs



38
39
40
# File 'lib/spidr/agent.rb', line 38

# List of URLs that could not be reached.
#
# @return [Object] the collection currently stored in @failures.
def failures
  return @failures
end

- (Object) history (readonly) Also known as: visited_urls

History containing visited URLs



35
36
37
# File 'lib/spidr/agent.rb', line 35

# History containing the visited URLs.
#
# @return [Object] the collection currently stored in @history.
def history
  return @history
end

- (Object) queue (readonly) Also known as: pending_urls

Queue of URLs to visit



41
42
43
# File 'lib/spidr/agent.rb', line 41

# Queue of URLs waiting to be visited.
#
# @return [Object] the collection currently stored in @queue.
def queue
  return @queue
end

- (Object) referer

Referer to use



29
30
31
# File 'lib/spidr/agent.rb', line 29

# Referer to use for requests.
#
# @return [Object] the value currently stored in @referer.
def referer
  return @referer
end

- (Object) user_agent

User-Agent to use



23
24
25
# File 'lib/spidr/agent.rb', line 23

# User-Agent to use for requests.
#
# @return [Object] the value currently stored in @user_agent.
def user_agent
  return @user_agent
end

Class Method Details

+ (Object) host(name, options = {}, &block) {|agent| ... }

Creates a new agent and spiders the given host.

Parameters:

  • (String) name — The host-name to spider.
  • (Hash) options (defaults to: {}) — Additional options. See Agent#initialize.

Yields:

  • (agent) — If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • (Agent) agent — The newly created agent.


148
149
150
151
152
153
154
# File 'lib/spidr/agent.rb', line 148

# Creates a new agent limited to the given host and spiders it,
# starting from the host's root URL over HTTP.
#
# @param [String] name
#   The host-name to spider.
#
# @param [Hash] options
#   Additional options; :host is merged in on a copy, the caller's
#   Hash is left untouched.
#
# @yield [agent]
#   If a block is given, it is passed the new agent before spidering
#   begins.
def self.host(name,options={},&block)
  agent_options = options.merge(:host => name)

  new(agent_options) do |agent|
    block.call(agent) unless block.nil?

    agent.start_at("http://#{name}/")
  end
end

+ (Object) site(url, options = {}, &block) {|agent| ... }

Creates a new agent and spiders the web-site located at the given URL.

Parameters:

  • (URI::HTTP, String) url — The web-site to spider.
  • (Hash) options (defaults to: {}) — Additional options. See Agent#initialize.

Yields:

  • (agent) — If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • (Agent) agent — The newly created agent.


172
173
174
175
176
177
178
179
180
# File 'lib/spidr/agent.rb', line 172

# Creates a new agent limited to the host of the given URL and
# starts spidering at that URL.
#
# @param [URI::HTTP, String] url
#   The web-site to spider; it is parsed with URI() after to_s.
#
# @param [Hash] options
#   Additional options; :host is merged in on a copy.
#
# @yield [agent]
#   If a block is given, it is passed the new agent before spidering
#   begins.
def self.site(url,options={},&block)
  site_uri = URI(url.to_s)
  agent_options = options.merge(:host => site_uri.host)

  new(agent_options) do |agent|
    block.call(agent) unless block.nil?

    agent.start_at(site_uri)
  end
end

+ (Object) start_at(url, options = {}, &block) {|agent| ... }

Creates a new agent and begins spidering at the given URL.

Parameters:

  • (URI::HTTP, String) url — The URL to start spidering at.
  • (Hash) options (defaults to: {}) — Additional options. See Agent#initialize.

Yields:

  • (agent) — If a block is given, it will be passed the newly created agent before it begins spidering.

Yield Parameters:

  • (Agent) agent — The newly created agent.


124
125
126
127
128
129
130
# File 'lib/spidr/agent.rb', line 124

# Creates a new agent and begins spidering at the given URL.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at; passed through unchanged to the
#   agent's start_at.
#
# @param [Hash] options
#   Additional options, handed to new as-is.
#
# @yield [agent]
#   If a block is given, it is passed the new agent before spidering
#   begins.
def self.start_at(url,options={},&block)
  new(options) do |agent|
    block.call(agent) unless block.nil?

    agent.start_at(url)
  end
end

Instance Method Details

- (Object) clear

Clears the queue, history and failures of the agent.



185
186
187
188
189
190
# File 'lib/spidr/agent.rb', line 185

# Empties the queue, the history and the failures list, in that order.
#
# @return [self] the agent itself.
def clear
  [@queue, @history, @failures].each { |collection| collection.clear }
  self
end

- (URI::HTTP) dequeue (protected)

Dequeues a URL that will later be visited.

Returns:

  • (URI::HTTP) — The URL that was at the front of the queue.


647
648
649
# File 'lib/spidr/agent.rb', line 647

# Removes and returns the URL at the front of the queue.
#
# @return [Object] the result of shifting @queue.
def dequeue
  return @queue.shift
end

- (Boolean) enqueue(url)

Enqueues a given URL for visiting, only if it passes all of the agent’s rules for visiting a given URL.

Parameters:

  • (URI::HTTP, String) url — The URL to enqueue for visiting.

Returns:

  • (Boolean) — Specifies whether the URL was enqueued, or ignored.


434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
# File 'lib/spidr/agent.rb', line 434

# Enqueues a given URL for visiting, only if it is not already queued
# and passes visit?.
#
# Before queueing, every every_url callback is run, followed by the
# urls_like callbacks whose pattern matches the URL (a Regexp matched
# against the URL string, or a value equal to the URL string or to
# the URL itself).
#
# @param [URI::HTTP, String] url
#   The URL to enqueue; it is first passed through sanitize_url.
#
# @return [Boolean]
#   true if the URL was enqueued, false if it was ignored or a
#   callback raised Actions::SkipLink.
#
# @raise [Actions::Paused]
#   Re-raised when a callback raises it.
def enqueue(url)
  url = sanitize_url(url)

  # already queued, or rejected by the visit rules
  return false if queued?(url) || !visit?(url)

  link = url.to_s

  begin
    @every_url_blocks.each { |callback| callback.call(url) }

    @urls_like_blocks.each do |pattern,callbacks|
      matched = (pattern.kind_of?(Regexp) && link =~ pattern) ||
                pattern == link ||
                pattern == url

      next unless matched

      callbacks.each { |callback| callback.call(url) }
    end
  rescue Actions::Paused => action
    raise(action)
  rescue Actions::SkipLink
    return false
  rescue Actions::Action
    # any other action only cuts the callbacks short; the URL is still queued
  end

  @queue << url
  true
end

- (Object) failed(url) (protected)

Adds a given URL to the failures list.

Parameters:

  • (URI::HTTP) url — The URL to add to the failures list.


675
676
677
678
679
# File 'lib/spidr/agent.rb', line 675

# Adds a given URL to the failures list and notifies every
# every_failed_url callback.
#
# @param [URI::HTTP] url
#   The URL to add to the failures list.
#
# @return [true]
def failed(url)
  @failures << url

  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end

- (Boolean) failed?(url)

Determines whether a given URL could not be visited.

Parameters:

  • (URI::HTTP, String) url — The URL to check for failures.

Returns:

  • (Boolean) — Specifies whether the given URL was unable to be visited.


377
378
379
380
381
# File 'lib/spidr/agent.rb', line 377

# Determines whether a given URL could not be visited.
#
# @param [URI::HTTP, String] url
#   The URL to check; anything that is not a URI is converted with
#   URI() after to_s.
#
# @return [Boolean]
#   Whether the failures list includes the URL.
def failed?(url)
  uri = url.kind_of?(URI) ? url : URI(url.to_s)

  @failures.include?(uri)
end

- (Array<URI::HTTP>) failures=(new_failures)

Sets the list of failed URLs.

Examples:

  agent.failures = ['http://localhost/']

Parameters:

  • (#each) new_failures — The new list of failed URLs.

Returns:

  • (Array<URI::HTTP>) — The list of failed URLs.


354
355
356
357
358
359
360
361
362
363
364
365
366
# File 'lib/spidr/agent.rb', line 354

# Sets the list of failed URLs.
#
# The incoming URLs are normalized before the current list is
# cleared, so assigning the agent's own failures list back to itself
# keeps its entries, and an invalid URL raises without leaving the
# list half-populated.
#
# @param [#each] new_failures
#   The new list of failed URLs; entries that are not URIs are
#   converted with URI() after to_s.
#
# @return [Object]
#   The same @failures collection, now holding the new URLs.
#
# @example
#   agent.failures = ['http://localhost/']
def failures=(new_failures)
  normalized = []

  # only #each is required of new_failures, so collect by hand
  new_failures.each do |url|
    normalized << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @failures.clear
  normalized.each { |url| @failures << url }

  return @failures
end

- (Page?) get_page(url, &block) {|page| ... }

Requests and creates a new Page object from a given URL.

Parameters:

  • (URI::HTTP) url — The URL to request.

Yields:

  • (page) — If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • (Page) page — The page for the response.

Returns:

  • (Page, nil) — The page for the response, or nil if the request failed.


478
479
480
481
482
483
484
485
486
487
488
489
490
# File 'lib/spidr/agent.rb', line 478

# Requests a URL with GET and wraps the response in a new Page.
#
# @param [URI::HTTP] url
#   The URL to request; it is re-parsed with URI() after to_s.
#
# @yield [page]
#   If a block is given, it is passed the page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns
#   when it never yields (nil when the request failed).
def get_page(url,&block)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.get(path,headers)
    page = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(page)

    block.call(page) unless block.nil?
    return page
  end
end

- (Set<URI::HTTP>) history=(new_history)

Sets the history of URLs that were previously visited.

Examples:

  agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']

Parameters:

  • (#each) new_history — A list of URLs to populate the history with.

Returns:

  • (Set<URI::HTTP>) — The history of the agent.


291
292
293
294
295
296
297
298
299
300
301
302
303
# File 'lib/spidr/agent.rb', line 291

# Sets the history of URLs that were previously visited.
#
# The incoming URLs are normalized before the current history is
# cleared, so assigning the agent's own history back to itself keeps
# its entries, and an invalid URL raises without leaving the history
# half-populated.
#
# @param [#each] new_history
#   A list of URLs to populate the history with; entries that are
#   not URIs are converted with URI() after to_s.
#
# @return [Object]
#   The same @history collection, now holding the new URLs.
#
# @example
#   agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
def history=(new_history)
  normalized = []

  # only #each is required of new_history, so collect by hand
  new_history.each do |url|
    normalized << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @history.clear
  normalized.each { |url| @history << url }

  return @history
end

- (Page?) post_page(url, post_data = '', &block) {|page| ... }

Posts supplied form data and creates a new Page object from a given URL.

Parameters:

  • (URI::HTTP) url — The URL to request.
  • (String) post_data (defaults to: '') — Form option data.

Yields:

  • (page) — If a block is given, it will be passed the page that represents the response.

Yield Parameters:

  • (Page) page — The page for the response.

Returns:

  • (Page, nil) — The page for the response, or nil if the request failed.

Since:

  • 0.2.2


513
514
515
516
517
518
519
520
521
522
523
524
525
# File 'lib/spidr/agent.rb', line 513

# Posts the supplied form data to a URL and wraps the response in a
# new Page.
#
# @param [URI::HTTP] url
#   The URL to request; it is re-parsed with URI() after to_s.
#
# @param [String] post_data
#   Form option data, passed to the session's post unchanged.
#
# @yield [page]
#   If a block is given, it is passed the page for the response.
#
# @return [Page, nil]
#   The page for the response, or whatever prepare_request returns
#   when it never yields (nil when the request failed).
#
# @since 0.2.2
def post_page(url,post_data='',&block)
  url = URI(url.to_s)

  prepare_request(url) do |session,path,headers|
    response = session.post(path,post_data,headers)
    page = Page.new(url,response)

    # save any new cookies
    @cookies.from_page(page)

    block.call(page) unless block.nil?
    return page
  end
end

- (Object) prepare_request(url, &block) {|request| ... } (protected)

Normalizes the request path and grabs a session to handle page get and post requests.

Parameters:

  • (URI::HTTP) url — The URL to request.

Yields:

  • (request) — A block whose purpose is to make a page request.

Yield Parameters:

  • (Net::HTTP) session — An HTTP session object.
  • (String) path — Normalized URL string.
  • (Hash) headers — A Hash of request header options.

Since:

  • 0.2.2


599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
# File 'lib/spidr/agent.rb', line 599

# Normalizes the request path, builds the request headers and yields
# them together with the session for the URL, so the caller can make
# a GET or POST request.
#
# If the block (or session lookup) raises one of the rescued network
# errors, the session for the URL is killed, the URL is recorded via
# failed and nil is returned.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session, path, headers]
#   A block whose purpose is to make a page request.
#
# @yieldparam [Object] session
#   The entry of @sessions for the URL.
#
# @yieldparam [String] path
#   The URL path ('/' when empty) with the query string appended.
#
# @yieldparam [Hash] headers
#   User-Agent, Referer, Authorization and Cookie headers; each one
#   is only present when the agent has a value for it.
#
# @return [Object, nil]
#   The block's result, or nil when the request failed.
#
# @since 0.2.2
def prepare_request(url,&block)
  # an empty path is requested as the root
  path = url.path.empty? ? '/' : url.path

  # append the URL query to the path
  path += "?#{url.query}" if url.query

  begin
    sleep(@delay) if @delay > 0

    headers = {}
    headers['User-Agent'] = @user_agent if @user_agent
    headers['Referer'] = @referer if @referer

    if (authorization = @authorized.for_url(url))
      headers['Authorization'] = "Basic #{authorization}"
    end

    if (header_cookies = @cookies.for_host(url.host))
      headers['Cookie'] = header_cookies
    end

    block.call(@sessions[url],path,headers)
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         Net::HTTPBadResponse,
         IOError

    # drop the broken session so it is not reused
    @sessions.kill!(url)

    failed(url)
    return nil
  end
end

- (Hash) proxy

The proxy information the agent uses.

Returns:

  • (Hash) — The proxy information.

See Also:

Since:

  • 0.2.2


258
259
260
# File 'lib/spidr/agent.rb', line 258

# The proxy information the agent uses, as reported by the session
# cache held in @sessions.
#
# @return [Hash] The proxy information.
#
# @since 0.2.2
def proxy
  return @sessions.proxy
end

- (Hash) proxy=(new_proxy)

Sets the proxy information that the agent uses.

Parameters:

  • (Hash) new_proxy — The new proxy information.

Returns:

  • (Hash) — The new proxy information.

See Also:

Since:

  • 0.2.2


275
276
277
# File 'lib/spidr/agent.rb', line 275

# Sets the proxy information that the agent uses by assigning it on
# the session cache held in @sessions.
#
# @param [Hash] new_proxy
#   The new proxy information.
#
# @return [Hash]
#   The new proxy information.
#
# @since 0.2.2
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
  new_proxy
end

- (Array<URI::HTTP>) queue=(new_queue)

Sets the queue of URLs to visit.

Examples:

  agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']

Parameters:

  • (#each) new_queue — The new list of URLs to visit.

Returns:

  • (Array<URI::HTTP>) — The list of URLs to visit.


397
398
399
400
401
402
403
404
405
406
407
408
409
# File 'lib/spidr/agent.rb', line 397

# Sets the queue of URLs to visit.
#
# The incoming URLs are normalized before the current queue is
# cleared, so assigning the agent's own queue back to itself (for
# example the :queue entry of to_hash) keeps its entries, and an
# invalid URL raises without leaving the queue half-populated.
#
# @param [#each] new_queue
#   The new list of URLs to visit; entries that are not URIs are
#   converted with URI() after to_s.
#
# @return [Object]
#   The same @queue collection, now holding the new URLs in order.
#
# @example
#   agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
def queue=(new_queue)
  normalized = []

  # only #each is required of new_queue, so collect by hand
  new_queue.each do |url|
    normalized << (url.kind_of?(URI) ? url : URI(url.to_s))
  end

  @queue.clear
  normalized.each { |url| @queue << url }

  return @queue
end

- (Boolean) queued?(url)

Determines whether a given URL has been enqueued.

Parameters:

  • (URI::HTTP) url — The URL to search for in the queue.

Returns:

  • (Boolean) — Specifies whether the given URL has been queued for visiting.


420
421
422
# File 'lib/spidr/agent.rb', line 420

# Determines whether a given URL has been enqueued.
#
# @param [URI::HTTP] url
#   The URL to search for in the queue; it is compared as given,
#   without any conversion.
#
# @return [Boolean]
#   Whether @queue includes the URL.
def queued?(url)
  return @queue.include?(url)
end

- (Object) run(&block) {|page| ... }

Start spidering until the queue becomes empty or the agent is paused.

Yields:

  • (page) — If a block is given, it will be passed every page visited.

Yield Parameters:

  • (Page) page — A page which has been visited.


220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/spidr/agent.rb', line 220

# Start spidering until the queue becomes empty or the agent is paused.
#
# When the loop finishes, the agent is marked as no longer running
# and its sessions are cleared. When Actions::Paused is raised while
# visiting a page, the method returns immediately without doing
# either.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @return [self] the agent itself.
def run(&block)
  @running = true

  while !@queue.empty? && !paused?
    begin
      visit_page(dequeue,&block)
    rescue Actions::Paused
      return self
    rescue Actions::Action
      # other actions only abort the current page
    end
  end

  @running = false
  @sessions.clear

  self
end

- (Boolean) running?

Determines if the agent is running.

Returns:

  • (Boolean) — Specifies whether the agent is running or stopped.


244
245
246
# File 'lib/spidr/agent.rb', line 244

# Determines if the agent is running.
#
# @return [Boolean]
#   true only when @running is exactly true.
def running?
  return (@running == true)
end

- (Object) start_at(url, &block) {|page| ... }

Start spidering at a given URL.

Parameters:

  • (URI::HTTP, String) url — The URL to start spidering at.

Yields:

  • (page) — If a block is given, it will be passed every page visited.

Yield Parameters:

  • (Page) page — A page which has been visited.


204
205
206
207
208
# File 'lib/spidr/agent.rb', line 204

# Start spidering at a given URL: the URL is enqueued, then the agent
# is run.
#
# @param [URI::HTTP, String] url
#   The URL to start spidering at.
#
# @yield [page]
#   If a block is given, it will be passed every page visited.
#
# @return [Object] whatever run returns.
def start_at(url,&block)
  enqueue(url)

  run(&block)
end

- (Hash) to_hash

Converts the agent into a Hash.

Returns:

  • (Hash) — The agent represented as a Hash containing the history and the queue of the agent.


572
573
574
# File 'lib/spidr/agent.rb', line 572

# Converts the agent into a Hash.
#
# @return [Hash]
#   A Hash with the agent's history under :history and its queue
#   under :queue; the values are the agent's own collections, not
#   copies.
def to_hash
  { history: @history, queue: @queue }
end

- (Boolean) visit?(url) (protected)

Determines if a given URL should be visited.

Parameters:

  • (URI::HTTP) url — The URL in question.

Returns:

  • (Boolean) — Specifies whether the given URL should be visited.


660
661
662
663
664
665
666
667
# File 'lib/spidr/agent.rb', line 660

# Determines if a given URL should be visited: it must not have been
# visited already, and its scheme, host, port, full link and path
# must each pass the corresponding visit_*? filter, checked in that
# order.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   Whether the given URL should be visited.
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_ext?(url.path)
end

- (Page?) visit_page(url, &block) {|page| ... }

Visits a given URL, and enqueues the links recovered from the URL to be visited later.

Parameters:

  • (URI::HTTP, String) url — The URL to visit.

Yields:

  • (page) — If a block is given, it will be passed the page which was visited.

Yield Parameters:

  • (Page) page — The page which was visited.

Returns:

  • (Page, nil) — The page that was visited. If nil is returned, either the request for the page failed, or the page was skipped.


544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
# File 'lib/spidr/agent.rb', line 544

# Visits a given URL, records it in the history, runs the every_page
# callbacks and the given block, and enqueues the links found on the
# page to be visited later.
#
# @param [URI::HTTP, String] url
#   The URL to visit; anything that is not a URI is converted with
#   URI() after to_s.
#
# @yield [page]
#   If a block is given, it will be passed the page which was visited.
#
# @return [Page, nil]
#   The result of get_page, or nil when a callback raised
#   Actions::SkipPage (the page's links are then not enqueued).
#
# @raise [Actions::Paused]
#   Re-raised when a callback raises it.
def visit_page(url,&block)
  target = url.kind_of?(URI) ? url : URI(url.to_s)

  get_page(target) do |page|
    @history << page.url

    begin
      @every_page_blocks.each { |callback| callback.call(page) }

      block.call(page) unless block.nil?
    rescue Actions::Paused => action
      raise(action)
    rescue Actions::SkipPage
      return nil
    rescue Actions::Action
      # other actions only cut the callbacks short; links are still enqueued
    end

    page.urls.each { |link| enqueue(link) }
  end
end

- (Boolean) visited?(url)

Determines whether a URL was visited or not.

Parameters:

  • (URI::HTTP, String) url — The URL to search for.

Returns:

  • (Boolean) — Specifies whether a URL was visited.


336
337
338
339
340
# File 'lib/spidr/agent.rb', line 336

# Determines whether a URL was visited or not.
#
# @param [URI::HTTP, String] url
#   The URL to search for; anything that is not a URI is converted
#   with URI() after to_s.
#
# @return [Boolean]
#   Whether the history includes the URL.
def visited?(url)
  uri = url.kind_of?(URI) ? url : URI(url.to_s)

  @history.include?(uri)
end

- (Array<String>) visited_hosts

Specifies all hosts that were visited.

Returns:

  • (Array<String>) — The hosts which have been visited.


323
324
325
# File 'lib/spidr/agent.rb', line 323

# Specifies all hosts that were visited.
#
# @return [Array<String>]
#   The distinct hosts of the visited URLs.
def visited_hosts
  hosts = visited_urls.map(&:host)

  hosts.uniq
end

- (Array<String>) visited_links

Specifies the links which have been visited.

Returns:

  • (Array<String>) — The links which have been visited.


313
314
315
# File 'lib/spidr/agent.rb', line 313

# Specifies the links which have been visited.
#
# @return [Array<String>]
#   Every entry of the history converted with to_s.
def visited_links
  @history.map(&:to_s)
end