Module: Spidr::Filters
- Defined in:
- lib/spidr/filters.rb
Class Method Summary
Instance Method Summary
- - (Array<String, Regexp, Proc>) ignore_exts Specifies the patterns that match URI path extensions to not visit.
- - (Object) ignore_exts_like(pattern = nil, &block) {|ext| ... } Adds a given pattern to the ignore_exts.
- - (Array<String, Regexp, Proc>) ignore_hosts Specifies the patterns that match host-names to not visit.
- - (Object) ignore_hosts_like(pattern = nil, &block) {|host| ... } Adds a given pattern to the ignore_hosts.
- - (Array<String, Regexp, Proc>) ignore_links Specifies the patterns that match links to not visit.
- - (Object) ignore_links_like(pattern = nil, &block) {|link| ... } Adds a given pattern to the ignore_links.
- - (Array<Integer, Regexp, Proc>) ignore_ports Specifies the patterns that match ports to not visit.
- - (Object) ignore_ports_like(pattern = nil, &block) {|port| ... } Adds a given pattern to the ignore_ports.
- - (Filters) initialize(options = {}) Initializes filtering rules.
- - (Object) schemes=(new_schemes) Sets the list of acceptable URL schemes to visit.
- - (Boolean) visit_ext?(path) protected Determines if a given URI path extension should be visited.
- - (Array<String, Regexp, Proc>) visit_exts Specifies the patterns that match the URI path extensions to visit.
- - (Object) visit_exts_like(pattern = nil, &block) {|ext| ... } Adds a given pattern to the visit_exts.
- - (Boolean) visit_host?(host) protected Determines if a given host-name should be visited.
- - (Array<String, Regexp, Proc>) visit_hosts Specifies the patterns that match host-names to visit.
- - (Object) visit_hosts_like(pattern = nil, &block) {|host| ... } Adds a given pattern to the visit_hosts.
- - (Boolean) visit_link?(link) protected Determines if a given link should be visited.
- - (Array<String, Regexp, Proc>) visit_links Specifies the patterns that match the links to visit.
- - (Object) visit_links_like(pattern = nil, &block) {|link| ... } Adds a given pattern to the visit_links.
- - (Boolean) visit_port?(port) protected Determines if a given port should be visited.
- - (Array<Integer, Regexp, Proc>) visit_ports Specifies the patterns that match the ports to visit.
- - (Object) visit_ports_like(pattern = nil, &block) {|port| ... } Adds a given pattern to the visit_ports.
- - (Boolean) visit_scheme?(scheme) protected Determines if a given URI scheme should be visited.
Class Method Details
+ (Object) included(base)
5 6 7 8 9 10 |
# File 'lib/spidr/filters.rb', line 5 def self.included(base) base.module_eval do # List of acceptable URL schemes to follow attr_reader :schemes end end |
Instance Method Details
- (Array<String, Regexp, Proc>) ignore_exts
Specifies the patterns that match URI path extensions to not visit.
343 344 345 |
# File 'lib/spidr/filters.rb', line 343 def ignore_exts @ext_rules.reject end |
- (Object) ignore_exts_like(pattern = nil, &block) {|ext| ... }
Adds a given pattern to the ignore_exts.
359 360 361 362 363 364 365 366 367 |
# File 'lib/spidr/filters.rb', line 359 def ignore_exts_like(pattern=nil,&block) if pattern ignore_exts << pattern elsif block ignore_exts << block end return self end |
- (Array<String, Regexp, Proc>) ignore_hosts
Specifies the patterns that match host-names to not visit.
151 152 153 |
# File 'lib/spidr/filters.rb', line 151 def ignore_hosts @host_rules.reject end |
- (Object) ignore_hosts_like(pattern = nil, &block) {|host| ... }
Adds a given pattern to the ignore_hosts.
167 168 169 170 171 172 173 174 175 |
# File 'lib/spidr/filters.rb', line 167 def ignore_hosts_like(pattern=nil,&block) if pattern ignore_hosts << pattern elsif block ignore_hosts << block end return self end |
- (Array<String, Regexp, Proc>) ignore_links
Specifies the patterns that match links to not visit.
279 280 281 |
# File 'lib/spidr/filters.rb', line 279 def ignore_links @link_rules.reject end |
- (Object) ignore_links_like(pattern = nil, &block) {|link| ... }
Adds a given pattern to the ignore_links.
295 296 297 298 299 300 301 302 303 |
# File 'lib/spidr/filters.rb', line 295 def ignore_links_like(pattern=nil,&block) if pattern ignore_links << pattern elsif block ignore_links << block end return self end |
- (Array<Integer, Regexp, Proc>) ignore_ports
Specifies the patterns that match ports to not visit.
215 216 217 |
# File 'lib/spidr/filters.rb', line 215 def ignore_ports @port_rules.reject end |
- (Object) ignore_ports_like(pattern = nil, &block) {|port| ... }
Adds a given pattern to the ignore_ports.
231 232 233 234 235 236 237 238 239 |
# File 'lib/spidr/filters.rb', line 231 def ignore_ports_like(pattern=nil,&block) if pattern ignore_ports << pattern elsif block ignore_ports << block end return self end |
- (Filters) initialize(options = {})
Initializes filtering rules.
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/spidr/filters.rb', line 49 def initialize(options={}) super() @schemes = [] if options[:schemes] @schemes += options[:schemes] else @schemes << 'http' begin require 'net/https' @schemes << 'https' rescue Gem::LoadError => e raise(e) rescue ::LoadError STDERR.puts "Warning: cannot load 'net/https', https support disabled" end end @host_rules = Rules.new( :accept => options[:hosts], :reject => options[:ignore_hosts] ) @port_rules = Rules.new( :accept => options[:ports], :reject => options[:ignore_ports] ) @link_rules = Rules.new( :accept => options[:links], :reject => options[:ignore_links] ) @ext_rules = Rules.new( :accept => options[:exts], :reject => options[:ignore_exts] ) if options[:host] visit_hosts_like(options[:host]) end if options[:queue] self.queue = options[:queue] end if options[:history] self.history = options[:history] end end |
- (Object) schemes=(new_schemes)
Sets the list of acceptable URL schemes to visit.
109 110 111 |
# File 'lib/spidr/filters.rb', line 109 def schemes=(new_schemes) @schemes = new_schemes.map { |scheme| scheme.to_s } end |
- (Boolean) visit_ext?(path) (protected)
Determines if a given URI path extension should be visited.
436 437 438 |
# File 'lib/spidr/filters.rb', line 436 def visit_ext?(path) @ext_rules.accept?(File.extname(path)[1..-1]) end |
- (Array<String, Regexp, Proc>) visit_exts
Specifies the patterns that match the URI path extensions to visit.
311 312 313 |
# File 'lib/spidr/filters.rb', line 311 def visit_exts @ext_rules.accept end |
- (Object) visit_exts_like(pattern = nil, &block) {|ext| ... }
Adds a given pattern to the visit_exts.
327 328 329 330 331 332 333 334 335 |
# File 'lib/spidr/filters.rb', line 327 def visit_exts_like(pattern=nil,&block) if pattern visit_exts << pattern elsif block visit_exts << block end return self end |
- (Boolean) visit_host?(host) (protected)
Determines if a given host-name should be visited.
397 398 399 |
# File 'lib/spidr/filters.rb', line 397 def visit_host?(host) @host_rules.accept?(host) end |
- (Array<String, Regexp, Proc>) visit_hosts
Specifies the patterns that match host-names to visit.
119 120 121 |
# File 'lib/spidr/filters.rb', line 119 def visit_hosts @host_rules.accept end |
- (Object) visit_hosts_like(pattern = nil, &block) {|host| ... }
Adds a given pattern to the visit_hosts.
135 136 137 138 139 140 141 142 143 |
# File 'lib/spidr/filters.rb', line 135 def visit_hosts_like(pattern=nil,&block) if pattern visit_hosts << pattern elsif block visit_hosts << block end return self end |
- (Boolean) visit_link?(link) (protected)
Determines if a given link should be visited.
423 424 425 |
# File 'lib/spidr/filters.rb', line 423 def visit_link?(link) @link_rules.accept?(link) end |
- (Array<String, Regexp, Proc>) visit_links
Specifies the patterns that match the links to visit.
247 248 249 |
# File 'lib/spidr/filters.rb', line 247 def visit_links @link_rules.accept end |
- (Object) visit_links_like(pattern = nil, &block) {|link| ... }
Adds a given pattern to the visit_links.
263 264 265 266 267 268 269 270 271 |
# File 'lib/spidr/filters.rb', line 263 def visit_links_like(pattern=nil,&block) if pattern visit_links << pattern elsif block visit_links << block end return self end |
- (Boolean) visit_port?(port) (protected)
Determines if a given port should be visited.
410 411 412 |
# File 'lib/spidr/filters.rb', line 410 def visit_port?(port) @port_rules.accept?(port) end |
- (Array<Integer, Regexp, Proc>) visit_ports
Specifies the patterns that match the ports to visit.
183 184 185 |
# File 'lib/spidr/filters.rb', line 183 def visit_ports @port_rules.accept end |
- (Object) visit_ports_like(pattern = nil, &block) {|port| ... }
Adds a given pattern to the visit_ports.
199 200 201 202 203 204 205 206 207 |
# File 'lib/spidr/filters.rb', line 199 def visit_ports_like(pattern=nil,&block) if pattern visit_ports << pattern elsif block visit_ports << block end return self end |
- (Boolean) visit_scheme?(scheme) (protected)
Determines if a given URI scheme should be visited.
380 381 382 383 384 385 386 |
# File 'lib/spidr/filters.rb', line 380 def visit_scheme?(scheme) if scheme return @schemes.include?(scheme) else return true end end |