SpaceScrape

Web scraping for the greater space... (like... outer space)

Trello board: used to track progress and possible ideas/extensions.

SpaceScrape is a basic little experiment of mine to build out a bit of an automated researcher, and to mess around with scrapers/crawlers.

Scraper Worker:

timed_out_domains = Redis.keys('timeout:*').map { |k| k.gsub(/timeout:/, '') }
webpage = Webpage.find(WHERE to_scrape = true AND domain NOT IN timed_out_domains ORDER BY priority)

return and notify(:finished) if (webpage.last_scrape.age is within LAST_SCRAPE_AGE)
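
For flavour, a minimal sketch of what that queue query might look like in real Ruby, assuming the redis gem plus Rails-style Webpage/Scrape models (the helper names, the last_scrape association, the priority ordering, and the freshness window are my assumptions, not the project's actual code):

    require 'redis'

    # Assumed freshness window; the real LAST_SCRAPE_AGE would live in app config.
    LAST_SCRAPE_AGE = 24 * 60 * 60 # seconds

    # Pick the next page to scrape: highest priority, flagged to_scrape, and not
    # on a domain that has a 'timeout:<domain>' marker in Redis.
    def next_webpage_to_scrape(redis: Redis.new)
      timed_out_domains = redis.keys('timeout:*').map { |k| k.sub('timeout:', '') }

      scope = Webpage.where(to_scrape: true).order(priority: :desc)
      scope = scope.where.not(domain: timed_out_domains) if timed_out_domains.any?
      scope.first
    end

    # Finish early if the page's most recent scrape is still fresh.
    def scraped_recently?(webpage)
      last = webpage.last_scrape
      last && last.created_at > Time.now - LAST_SCRAPE_AGE
    end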

scrape_url(url: webpage.url) ->
  return and notify(:skipped) if blacklisted?(url)
  return and notify(:skipped) unless robots.allowed?(url)

results = get(url)
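
The 'timeout:*' keys above imply that slow domains get marked in Redis so the worker can skip them for a while. A hedged sketch of how get(url) and that marking could fit together, using Net::HTTP and an assumed one-hour back-off (fetch_page and DOMAIN_TIMEOUT_TTL are hypothetical names; the blacklist and robots checks are left out):

    require 'net/http'
    require 'uri'
    require 'redis'

    DOMAIN_TIMEOUT_TTL = 60 * 60 # assumed: skip a slow domain for an hour

    # Fetch a URL; on a timeout, mark the domain in Redis with a TTL so the
    # Scraper Worker leaves it alone until the key expires.
    def fetch_page(url, redis: Redis.new)
      uri = URI.parse(url)
      response = Net::HTTP.get_response(uri)
      response.is_a?(Net::HTTPSuccess) ? response.body : nil
    rescue Net::OpenTimeout, Net::ReadTimeout
      redis.setex("timeout:#{uri.host}", DOMAIN_TIMEOUT_TTL, '1')
      nil
    end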

webpage.raw = results
webpage.content = extract(raw: results)

scrape = webpage.create_scrape

notify(:finished)
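
The extract step above can start out very simple; a sketch using Nokogiri that just strips scripts and styles and keeps the visible text (the helper signature mirrors the pseudocode, but the exact cleanup rules are assumptions):

    require 'nokogiri'

    # Reduce raw HTML to plain text: drop non-content tags, collapse whitespace.
    def extract(raw:)
      doc = Nokogiri::HTML(raw)
      doc.css('script, style, noscript').each(&:remove)
      doc.text.squeeze(" \n").strip
    end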

Relevancy And Link Worker:

on(:finished) ->

jobs = Job.join(Relevance).join(Scrape, with: scrape).where(Relevance IS NULL OR Scrape IS NULL AND Job.finished IS NULL)
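
In Rails-ish terms that lookup might be something like the following, assuming Job has_one :relevance and has_one :scrape (table and column names are guesses, and this ignores scoping to the specific scrape that triggered the event):

    # Unfinished jobs that still lack a relevance score or a scrape record.
    jobs = Job.left_outer_joins(:relevance, :scrape)
              .where(finished: nil)
              .where('relevances.id IS NULL OR scrapes.id IS NULL')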

jobs.each(job) ->
  return if job.expired?

  relevance = calculate_relevance(for: webpage, related_to: job)

  links = extract_links(from: webpage.raw)

  links.each(link_url) ->
    link_webpage = Webpage.find_or_create(url: link_url)
    link_link    = Link.find_or_create(from: webpage.url, to: link_url, scrape: scrape)

    if link_webpage.new? and relevance is over job.relevance_threshold
      link_webpage.to_scrape = true
      link_webpage.priority = 10
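
A sketch of the two helpers the loop above leans on: link extraction with Nokogiri and a deliberately naive keyword-overlap relevance score (I've added a base_url parameter to resolve relative hrefs, and job.keywords is an assumed attribute; the real calculate_relevance is presumably smarter):

    require 'nokogiri'
    require 'uri'

    # Pull absolute http(s) links out of the raw HTML, resolving relative
    # hrefs against the page's own URL.
    def extract_links(from:, base_url:)
      doc = Nokogiri::HTML(from)
      doc.css('a[href]')
         .map { |a| URI.join(base_url, a['href']).to_s rescue nil }
         .compact
         .select { |link| link.start_with?('http') }
         .uniq
    end

    # Naive relevance: the fraction of a job's keywords found in the page text.
    def calculate_relevance(for_webpage:, related_to:)
      keywords = related_to.keywords # assumed: an array of strings on Job
      return 0.0 if keywords.empty?

      text = for_webpage.content.to_s.downcase
      keywords.count { |kw| text.include?(kw.downcase) }.to_f / keywords.size
    end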

Job Worker:

jobs = Job.where(finished IS NULL AND report_date < NOW)
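
The due-jobs query, in the same Rails-ish terms as above (the finished and report_date column names are taken from the pseudocode, not a real schema):

    # Jobs that haven't finished and whose report date has passed.
    jobs = Job.where(finished: nil).where('report_date < ?', Time.now)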

jobs.each(job) ->