-
Notifications
You must be signed in to change notification settings - Fork 0
/
Rakefile
110 lines (86 loc) · 3.13 KB
/
Rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# frozen_string_literal: true
require 'cgi'
require 'csv'
require 'open-uri'

require 'nokogiri'

require './scrapers/character_scraper'
require './scrapers/potion_scraper'
require './scrapers/spell_scraper'
# Root of the wiki; captured by the task blocks below and passed on to the
# link-scraping helpers.
base_url = 'https://harrypotter.fandom.com'

# Writes scraped records to +path+ as CSV: the keys of the first record form
# the header row, and every record contributes one row of its values.
#
# When +records+ is empty the task is aborted with a readable message before
# the file is opened, so a previous export is not truncated to zero bytes.
#
# @param path [String] destination CSV file
# @param records [Array<#keys, #values>] scraped records (hash-like)
# @return [void]
def write_scraped_csv(path, records)
  abort "Nothing was scraped; #{path} was left untouched." if records.empty?

  CSV.open(path, 'w') do |csv|
    csv << records.first.keys
    records.each { |record| csv << record.values }
  end
end

namespace :scrabby do
  desc 'Scrape characters from the Harry Potter Wiki'
  task :characters do
    puts 'Start scraping characters...'

    character_links = []
    %w[Females Males Individuals_of_unknown_or_undetermined_gender].each do |gender|
      scrape_character_links("#{base_url}/wiki/Category:#{gender}", base_url, character_links)
    end

    # The scraper is handed the records collected so far in addition to the
    # URL; what it does with them is decided in CharacterScraper.
    characters = []
    character_links.sort.each do |url|
      characters << CharacterScraper.scrape_page(url, characters)
    end

    write_scraped_csv('data/characters.csv', characters)
    puts "Scraped #{characters.count} characters!"
  end

  desc 'Scrape potions from the Harry Potter Wiki'
  task :potions do
    puts 'Start scraping potions...'

    potion_links = []
    scrape_potion_links(base_url, potion_links)

    # Same accumulator hand-off as in the characters task.
    potions = []
    potion_links.sort.each do |url|
      potions << PotionScraper.scrape_page(url, potions)
    end

    write_scraped_csv('data/potions.csv', potions)
    puts "Scraped #{potions.count} potions!"
  end

  desc 'Scrape spells from the Harry Potter Wiki'
  task :spells do
    puts 'Start scraping spells...'

    spell_links = []
    scrape_spell_links(base_url, spell_links)

    # Same accumulator hand-off as in the characters task.
    spells = []
    spell_links.sort.each do |url|
      spells << SpellScraper.scrape_page(url, spells)
    end

    write_scraped_csv('data/spells.csv', spells)
    puts "Scraped #{spells.count} spells!"
  end
end
# Walks a paginated category listing and appends the absolute URL of every
# member link to +links+, skipping URLs that are already present.
#
# Member hrefs are prefixed with +base_url+. The "next page" href is fetched
# exactly as found in the markup, with no prefix added.
#
# @param url [String] address of the first category page to fetch
# @param base_url [String] prefix prepended to each member href
# @param links [Array<String>] accumulator, mutated in place
# @return [nil]
def scrape_character_links(url, base_url, links)
  page_url = url

  loop do
    document = Nokogiri::HTML(URI.parse(page_url).open.read)

    document.search('.category-page__member-link').each do |anchor|
      absolute = base_url + anchor.attribute('href').value
      next if links.include?(absolute)

      links << absolute
    end

    # No pagination element means this was the last page of the category.
    pagination = document.search('.category-page__pagination-next').first
    break unless pagination

    page_url = pagination.attribute('href').value
  end
end
# Gathers potion page URLs from the wiki's "List_of_potions" page into
# +links+, skipping URLs that are already present.
#
# The page is queried with two selectors, in this order: anchors inside
# <li> elements nested in a <ul>, then anchors inside <li> elements that are
# direct children of the tab container.
#
# @param base_url [String] wiki root; used to build the list URL and as the
#   prefix for every href found
# @param links [Array<String>] accumulator, mutated in place
# @return [void]
def scrape_potion_links(base_url, links)
  html = URI.parse("#{base_url}/wiki/List_of_potions").open.read
  doc = Nokogiri::HTML(html)

  collect_potion_links(doc, 'div.wds-tab__content > ul > li > a', base_url, links)
  collect_potion_links(doc, 'div.wds-tab__content > li > a', base_url, links)
end

# Appends the URL of every anchor matched by +selector+ to +links+ unless it
# is already there.
def collect_potion_links(doc, selector, base_url, links)
  doc.search(selector).each do |anchor|
    # The complete URL (prefix included) is URL-decoded before it is stored.
    potion_link = CGI.unescape(base_url + anchor.attribute('href').value)
    links << potion_link unless links.include?(potion_link)
  end
end
# Gathers spell page URLs from the wiki's "List_of_spells" page into +links+.
#
# Every anchor matched by the heading selector has its href prefixed with
# +base_url+; URLs already present in +links+ are not added again.
#
# @param base_url [String] wiki root; used to build the list URL and as the
#   prefix for every href found
# @param links [Array<String>] accumulator, mutated in place
# @return [void]
def scrape_spell_links(base_url, links)
  listing_url = "#{base_url}/wiki/List_of_spells"
  document = Nokogiri::HTML(URI.parse(listing_url).open.read)

  document.search('.wds-tab__content > h3 > span a').each do |anchor|
    candidate = base_url + anchor.attribute('href').value
    next if links.include?(candidate)

    links << candidate
  end
end