Module: Spidr::Filters

Defined in:
lib/spidr/filters.rb

Class Method Summary

Instance Method Summary

Class Method Details

+ (Object) included(base)



5
6
7
8
9
10
# File 'lib/spidr/filters.rb', line 5

# Hook invoked with the class or module that mixes this module in.
# Declares a reader for the list of acceptable URL schemes on +base+.
#
# @param [Module] base
#   The class or module receiving the +schemes+ reader.
#
def self.included(base)
  base.module_eval { attr_reader :schemes }
end

Instance Method Details

- (Array<String, Regexp, Proc>) ignore_exts

Specifies the patterns that match URI path extensions to not visit.

Returns:

  • (Array<String, Regexp, Proc>) — The URI path extension patterns to not visit.


343
344
345
# File 'lib/spidr/filters.rb', line 343

# Specifies the patterns that match URI path extensions to not visit.
#
# @return [Array<String, Regexp, Proc>]
#   The URI path extension patterns to not visit, i.e. the +reject+
#   list of the extension rules held in +@ext_rules+.
#
def ignore_exts
  @ext_rules.reject
end

- (Object) ignore_exts_like(pattern = nil, &block) {|ext| ... }

Adds a given pattern to the ignore_exts.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match URI path extensions with.

Yields:

  • (ext) — If a block is given, it will be used to filter URI path extensions.

Yield Parameters:

  • (String) ext — A URI path extension to reject or accept.


359
360
361
362
363
364
365
366
367
# File 'lib/spidr/filters.rb', line 359

# Adds a given pattern to the #ignore_exts.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with. When both a pattern
#   and a block are given, only the pattern is added.
#
# @yield [ext]
#   If a block is given (and no pattern), it is added as the filter for
#   URI path extensions.
#
# @yieldparam [String] ext
#   A URI path extension to reject or accept.
#
# @return [self]
#   The receiver.
#
def ignore_exts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_exts << rule if rule

  self
end

- (Array<String, Regexp, Proc>) ignore_hosts

Specifies the patterns that match host-names to not visit.

Returns:

  • (Array<String, Regexp, Proc>) — The host-name patterns to not visit.


151
152
153
# File 'lib/spidr/filters.rb', line 151

# Specifies the patterns that match host-names to not visit.
#
# @return [Array<String, Regexp, Proc>]
#   The host-name patterns to not visit, i.e. the +reject+ list of the
#   host rules held in +@host_rules+.
#
def ignore_hosts
  @host_rules.reject
end

- (Object) ignore_hosts_like(pattern = nil, &block) {|host| ... }

Adds a given pattern to the ignore_hosts.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match host-names with.

Yields:

  • (host) — If a block is given, it will be used to filter host-names.

Yield Parameters:

  • (String) host — A host-name to reject or accept.


167
168
169
170
171
172
173
174
175
# File 'lib/spidr/filters.rb', line 167

# Adds a given pattern to the #ignore_hosts.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with. When both a pattern and a
#   block are given, only the pattern is added.
#
# @yield [host]
#   If a block is given (and no pattern), it is added as the filter for
#   host-names.
#
# @yieldparam [String] host
#   A host-name to reject or accept.
#
# @return [self]
#   The receiver.
#
def ignore_hosts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end

- (Array<String, Regexp, Proc>) ignore_links

Specifies the patterns that match links to not visit.

Returns:

  • (Array<String, Regexp, Proc>) — The link patterns to not visit.


279
280
281
# File 'lib/spidr/filters.rb', line 279

# Specifies the patterns that match links to not visit.
#
# @return [Array<String, Regexp, Proc>]
#   The link patterns to not visit, i.e. the +reject+ list of the link
#   rules held in +@link_rules+.
#
def ignore_links
  @link_rules.reject
end

- (Object) ignore_links_like(pattern = nil, &block) {|link| ... }

Adds a given pattern to the ignore_links.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match links with.

Yields:

  • (link) — If a block is given, it will be used to filter links.

Yield Parameters:

  • (String) link — A link to reject or accept.


295
296
297
298
299
300
301
302
303
# File 'lib/spidr/filters.rb', line 295

# Adds a given pattern to the #ignore_links.
#
# @param [String, Regexp] pattern
#   The pattern to match links with. When both a pattern and a block
#   are given, only the pattern is added.
#
# @yield [link]
#   If a block is given (and no pattern), it is added as the filter for
#   links.
#
# @yieldparam [String] link
#   A link to reject or accept.
#
# @return [self]
#   The receiver.
#
def ignore_links_like(pattern = nil, &block)
  rule = pattern || block
  ignore_links << rule if rule

  self
end

- (Array<Integer, Regexp, Proc>) ignore_ports

Specifies the patterns that match ports to not visit.

Returns:

  • (Array<Integer, Regexp, Proc>) — The port patterns to not visit.


215
216
217
# File 'lib/spidr/filters.rb', line 215

# Specifies the patterns that match ports to not visit.
#
# @return [Array<Integer, Regexp, Proc>]
#   The port patterns to not visit, i.e. the +reject+ list of the port
#   rules held in +@port_rules+.
#
def ignore_ports
  @port_rules.reject
end

- (Object) ignore_ports_like(pattern = nil, &block) {|port| ... }

Adds a given pattern to the ignore_ports.

Parameters:

  • (Integer, Regexp) pattern (defaults to: nil) — The pattern to match ports with.

Yields:

  • (port) — If a block is given, it will be used to filter ports.

Yield Parameters:

  • (Integer) port — A port to reject or accept.


231
232
233
234
235
236
237
238
239
# File 'lib/spidr/filters.rb', line 231

# Adds a given pattern to the #ignore_ports.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with. When both a pattern and a block
#   are given, only the pattern is added.
#
# @yield [port]
#   If a block is given (and no pattern), it is added as the filter for
#   ports.
#
# @yieldparam [Integer] port
#   A port to reject or accept.
#
# @return [self]
#   The receiver.
#
def ignore_ports_like(pattern = nil, &block)
  rule = pattern || block
  ignore_ports << rule if rule

  self
end

- (Filters) initialize(options = {})

Initializes filtering rules.

Parameters:

  • (Hash) options (defaults to: {}) — Additional options.

Options Hash (options):

  • (Array) :schemes — default: ['http', 'https'] — The list of acceptable URI schemes to visit. The https scheme will be ignored if net/https cannot be loaded.
  • (String) :host N/A — The host-name to visit.
  • (Array<String, Regexp, Proc>) :hosts N/A — The patterns which match the host-names to visit.
  • (Array<String, Regexp, Proc>) :ignore_hosts N/A — The patterns which match the host-names to not visit.
  • (Array<Integer, Regexp, Proc>) :ports N/A — The patterns which match the ports to visit.
  • (Array<Integer, Regexp, Proc>) :ignore_ports N/A — The patterns which match the ports to not visit.
  • (Array<String, Regexp, Proc>) :links N/A — The patterns which match the links to visit.
  • (Array<String, Regexp, Proc>) :ignore_links N/A — The patterns which match the links to not visit.
  • (Array<String, Regexp, Proc>) :exts N/A — The patterns which match the URI path extensions to visit.
  • (Array<String, Regexp, Proc>) :ignore_exts N/A — The patterns which match the URI path extensions to not visit.


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/spidr/filters.rb', line 49

#
# Initializes filtering rules.
#
# @param [Hash] options
#   Additional options. The same Hash is also forwarded to +super+.
#
# @option options [Array] :schemes (['http', 'https'])
#   The list of acceptable URI schemes to visit. Entries are stored as
#   Strings. When the option is omitted, +https+ is only enabled if
#   +net/https+ can be loaded.
#
# @option options [String] :host
#   The host-name to visit.
#
# @option options [Array<String, Regexp, Proc>] :hosts
#   The patterns which match the host-names to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_hosts
#   The patterns which match the host-names to not visit.
#
# @option options [Array<Integer, Regexp, Proc>] :ports
#   The patterns which match the ports to visit.
#
# @option options [Array<Integer, Regexp, Proc>] :ignore_ports
#   The patterns which match the ports to not visit.
#
# @option options [Array<String, Regexp, Proc>] :links
#   The patterns which match the links to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_links
#   The patterns which match the links to not visit.
#
# @option options [Array<String, Regexp, Proc>] :exts
#   The patterns which match the URI path extensions to visit.
#
# @option options [Array<String, Regexp, Proc>] :ignore_exts
#   The patterns which match the URI path extensions to not visit.
#
# @option options :queue
#   If given, assigned through the +queue=+ writer.
#   NOTE(review): that writer is not visible in this section; presumably
#   the including class provides it -- confirm.
#
# @option options :history
#   If given, assigned through the +history=+ writer.
#   NOTE(review): same caveat as for +:queue+.
#
# @raise [Gem::LoadError]
#   Re-raised if requiring +net/https+ raises it.
#
def initialize(options={})
  super(options)

  if options[:schemes]
    # Normalize to Strings, exactly as #schemes= does, so that Symbol
    # schemes such as :http still match in #visit_scheme?. Array() also
    # accepts a single scheme or any other enumerable.
    @schemes = Array(options[:schemes]).map { |scheme| scheme.to_s }
  else
    @schemes = ['http']

    begin
      require 'net/https'

      @schemes << 'https'
    rescue Gem::LoadError => e
      # Gem::LoadError is a LoadError subclass; re-raise it here so the
      # generic handler below does not downgrade it to a warning.
      raise(e)
    rescue ::LoadError
      STDERR.puts "Warning: cannot load 'net/https', https support disabled"
    end
  end

  @host_rules = Rules.new(
    :accept => options[:hosts],
    :reject => options[:ignore_hosts]
  )
  @port_rules = Rules.new(
    :accept => options[:ports],
    :reject => options[:ignore_ports]
  )
  @link_rules = Rules.new(
    :accept => options[:links],
    :reject => options[:ignore_links]
  )
  @ext_rules = Rules.new(
    :accept => options[:exts],
    :reject => options[:ignore_exts]
  )

  # A single :host is treated as one more accepted host pattern.
  if options[:host]
    visit_hosts_like(options[:host])
  end

  if options[:queue]
    self.queue = options[:queue]
  end

  if options[:history]
    self.history = options[:history]
  end
end

- (Object) schemes=(new_schemes)

Sets the list of acceptable URL schemes to visit.

Examples:

  agent.schemes = ['http']

Parameters:

  • (Array) new_schemes — The new schemes to visit.


109
110
111
# File 'lib/spidr/filters.rb', line 109

# Sets the list of acceptable URL schemes to visit.
#
# @param [Array] new_schemes
#   The new schemes to visit. Each entry is converted with +to_s+ and
#   the result is stored in a new Array; +new_schemes+ is not modified.
#
# @example
#   agent.schemes = ['http']
#
def schemes=(new_schemes)
  @schemes = new_schemes.map(&:to_s)
end

- (Boolean) visit_ext?(path) (protected)

Determines if a given URI path extension should be visited.

Parameters:

  • (String) path — The path that contains the extension.

Returns:

  • (Boolean) — Specifies whether the given URI path extension should be visited.


436
437
438
# File 'lib/spidr/filters.rb', line 436

# Determines if a given URI path extension should be visited.
#
# @param [String] path
#   The path that contains the extension.
#
# @return [Boolean]
#   Specifies whether the given URI path extension should be visited,
#   as answered by the extension rules in +@ext_rules+.
#
def visit_ext?(path)
  # Drop the leading "." that File.extname keeps. A path without an
  # extension yields nil here, which is handed to the rules as-is.
  ext = File.extname(path).slice(1..-1)

  @ext_rules.accept?(ext)
end

- (Array<String, Regexp, Proc>) visit_exts

Specifies the patterns that match the URI path extensions to visit.

Returns:

  • (Array<String, Regexp, Proc>) — The URI path extensions patterns to visit.


311
312
313
# File 'lib/spidr/filters.rb', line 311

# Specifies the patterns that match the URI path extensions to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The URI path extension patterns to visit, i.e. the +accept+ list of
#   the extension rules held in +@ext_rules+.
#
def visit_exts
  @ext_rules.accept
end

- (Object) visit_exts_like(pattern = nil, &block) {|ext| ... }

Adds a given pattern to the visit_exts.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match URI path extensions with.

Yields:

  • (ext) — If a block is given, it will be used to filter URI path extensions.

Yield Parameters:

  • (String) ext — A URI path extension to accept or reject.


327
328
329
330
331
332
333
334
335
# File 'lib/spidr/filters.rb', line 327

# Adds a given pattern to the #visit_exts.
#
# @param [String, Regexp] pattern
#   The pattern to match URI path extensions with. When both a pattern
#   and a block are given, only the pattern is added.
#
# @yield [ext]
#   If a block is given (and no pattern), it is added as the filter for
#   URI path extensions.
#
# @yieldparam [String] ext
#   A URI path extension to accept or reject.
#
# @return [self]
#   The receiver.
#
def visit_exts_like(pattern = nil, &block)
  rule = pattern || block
  visit_exts << rule if rule

  self
end

- (Boolean) visit_host?(host) (protected)

Determines if a given host-name should be visited.

Parameters:

  • (String) host — The host-name.

Returns:

  • (Boolean) — Specifies whether the given host-name should be visited.


397
398
399
# File 'lib/spidr/filters.rb', line 397

# Determines if a given host-name should be visited.
#
# @param [String] host
#   The host-name.
#
# @return [Boolean]
#   Specifies whether the given host-name should be visited, as
#   answered by +accept?+ on the host rules in +@host_rules+.
#
def visit_host?(host)
  @host_rules.accept?(host)
end

- (Array<String, Regexp, Proc>) visit_hosts

Specifies the patterns that match host-names to visit.

Returns:

  • (Array<String, Regexp, Proc>) — The host-name patterns to visit.


119
120
121
# File 'lib/spidr/filters.rb', line 119

# Specifies the patterns that match host-names to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The host-name patterns to visit, i.e. the +accept+ list of the host
#   rules held in +@host_rules+.
#
def visit_hosts
  @host_rules.accept
end

- (Object) visit_hosts_like(pattern = nil, &block) {|host| ... }

Adds a given pattern to the visit_hosts.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match host-names with.

Yields:

  • (host) — If a block is given, it will be used to filter host-names.

Yield Parameters:

  • (String) host — A host-name to accept or reject.


135
136
137
138
139
140
141
142
143
# File 'lib/spidr/filters.rb', line 135

# Adds a given pattern to the #visit_hosts.
#
# @param [String, Regexp] pattern
#   The pattern to match host-names with. When both a pattern and a
#   block are given, only the pattern is added.
#
# @yield [host]
#   If a block is given (and no pattern), it is added as the filter for
#   host-names.
#
# @yieldparam [String] host
#   A host-name to accept or reject.
#
# @return [self]
#   The receiver.
#
def visit_hosts_like(pattern = nil, &block)
  rule = pattern || block
  visit_hosts << rule if rule

  self
end

- (Boolean) visit_link?(link) (protected)

Determines if a given link should be visited.

Parameters:

  • (String) link — The link.

Returns:

  • (Boolean) — Specifies whether the given link should be visited.


423
424
425
# File 'lib/spidr/filters.rb', line 423

# Determines if a given link should be visited.
#
# @param [String] link
#   The link.
#
# @return [Boolean]
#   Specifies whether the given link should be visited, as answered by
#   +accept?+ on the link rules in +@link_rules+.
#
def visit_link?(link)
  @link_rules.accept?(link)
end

- (Array<String, Regexp, Proc>) visit_links

Specifies the patterns that match the links to visit.

Returns:

  • (Array<String, Regexp, Proc>) — The link patterns to visit.


247
248
249
# File 'lib/spidr/filters.rb', line 247

# Specifies the patterns that match the links to visit.
#
# @return [Array<String, Regexp, Proc>]
#   The link patterns to visit, i.e. the +accept+ list of the link
#   rules held in +@link_rules+.
#
def visit_links
  @link_rules.accept
end

- (Object) visit_links_like(pattern = nil, &block) {|link| ... }

Adds a given pattern to the visit_links.

Parameters:

  • (String, Regexp) pattern (defaults to: nil) — The pattern to match links with.

Yields:

  • (link) — If a block is given, it will be used to filter links.

Yield Parameters:

  • (String) link — A link to accept or reject.


263
264
265
266
267
268
269
270
271
# File 'lib/spidr/filters.rb', line 263

# Adds a given pattern to the #visit_links.
#
# @param [String, Regexp] pattern
#   The pattern to match links with. When both a pattern and a block
#   are given, only the pattern is added.
#
# @yield [link]
#   If a block is given (and no pattern), it is added as the filter for
#   links.
#
# @yieldparam [String] link
#   A link to accept or reject.
#
# @return [self]
#   The receiver.
#
def visit_links_like(pattern = nil, &block)
  rule = pattern || block
  visit_links << rule if rule

  self
end

- (Boolean) visit_port?(port) (protected)

Determines if a given port should be visited.

Parameters:

  • (Integer) port — The port number.

Returns:

  • (Boolean) — Specifies whether the given port should be visited.


410
411
412
# File 'lib/spidr/filters.rb', line 410

# Determines if a given port should be visited.
#
# @param [Integer] port
#   The port number.
#
# @return [Boolean]
#   Specifies whether the given port should be visited, as answered by
#   +accept?+ on the port rules in +@port_rules+.
#
def visit_port?(port)
  @port_rules.accept?(port)
end

- (Array<Integer, Regexp, Proc>) visit_ports

Specifies the patterns that match the ports to visit.

Returns:

  • (Array<Integer, Regexp, Proc>) — The port patterns to visit.


183
184
185
# File 'lib/spidr/filters.rb', line 183

# Specifies the patterns that match the ports to visit.
#
# @return [Array<Integer, Regexp, Proc>]
#   The port patterns to visit, i.e. the +accept+ list of the port
#   rules held in +@port_rules+.
#
def visit_ports
  @port_rules.accept
end

- (Object) visit_ports_like(pattern = nil, &block) {|port| ... }

Adds a given pattern to the visit_ports.

Parameters:

  • (Integer, Regexp) pattern (defaults to: nil) — The pattern to match ports with.

Yields:

  • (port) — If a block is given, it will be used to filter ports.

Yield Parameters:

  • (Integer) port — A port to accept or reject.


199
200
201
202
203
204
205
206
207
# File 'lib/spidr/filters.rb', line 199

# Adds a given pattern to the #visit_ports.
#
# @param [Integer, Regexp] pattern
#   The pattern to match ports with. When both a pattern and a block
#   are given, only the pattern is added.
#
# @yield [port]
#   If a block is given (and no pattern), it is added as the filter for
#   ports.
#
# @yieldparam [Integer] port
#   A port to accept or reject.
#
# @return [self]
#   The receiver.
#
def visit_ports_like(pattern = nil, &block)
  rule = pattern || block
  visit_ports << rule if rule

  self
end

- (Boolean) visit_scheme?(scheme) (protected)

Determines if a given URI scheme should be visited.

Parameters:

  • (String) scheme — The URI scheme.

Returns:

  • (Boolean) — Specifies whether the given scheme should be visited.


380
381
382
383
384
385
386
# File 'lib/spidr/filters.rb', line 380

# Determines if a given URI scheme should be visited.
#
# @param [String] scheme
#   The URI scheme.
#
# @return [Boolean]
#   Specifies whether the given scheme should be visited: +true+ when
#   no scheme is given, otherwise whether +@schemes+ includes it.
#
def visit_scheme?(scheme)
  # A missing scheme is never filtered out here.
  return true unless scheme

  @schemes.include?(scheme)
end