# WebScrapingandRectangling(1).R
install.packages("rvest")
library(rvest)
install.packages("repurrrsive")
library(repurrrsive)
library(dplyr)
library(tidyverse)
install.packages("listviewer")
# RECTANGLING (transforming a complicated list, called a nested list,
# into a data frame that is easier to interpret and process)
# Nested lists -> lists within a list.
# Here is a basic example that we studied previously:
# X <- list(23, 3.01, "alpha", b = list(100, 2.033, "rail", NA)). Note
# that we have a list within a list: list b is nested in list X.
# Our cases and examples will be more involved, but the basic idea is the
# same.
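# As a quick runnable illustration (this snippet is our own, not part of
# the datasets used below):
X <- list(23, 3.01, "alpha", b = list(100, 2.033, "rail", NA))
str(X)    # b is a list nested inside X
X$b[[3]]  # "rail" -- indexing into the nested list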
# Let's look at a nested list involving 6 GitHub users
(users <- tibble(user = gh_users))
# Let's take a look at the list data for each GitHub user
listviewer::jsonedit(gh_users)
# Now use the following code to print out the 30 profile characteristics
# in the console
names(users$user[[1]])
# Now transform the nested data into a data frame (a representation that is good for analysis)
users %>%
  unnest_wider(user)
# Note that the characteristics have become column variables
users %>%
  unnest_longer(user) # not a good representation; the wider option is better
# You may not want all of the column variables. You can use hoist() to
# select just the variables and list content that you want.
users %>%
  hoist(user,
        followers = "followers",
        login = "login",
        url = "html_url")
# Game of Thrones
got_chars # a list of Game of Thrones characters that comes with no
# special structure or organization
# Let's organize the data into a tibble (18 characteristics for 30 characters)
chars <- tibble(char = got_chars)
chars
# Now let's look at the specific data for the 30 characters
listviewer::jsonedit(got_chars) #1, #8, #17 #22
# Now transform the nested data into a data frame
chars2 <- chars %>% unnest_wider(char)
chars2
# This is more complex than gh_users because some components of char are
# themselves a list, giving us a collection of list-columns:
chars2 %>% select_if(is.list) # check out the different aliases for person 1 in the data table
# What you do next will depend on the purposes of the analysis. Maybe you
# want a row for every book and TV series that the character appears in:
chars2 %>%
  select(name, books, tvSeries) %>%
  pivot_longer(c(books, tvSeries), names_to = "media", values_to = "value") %>%
  unnest_longer(value)
# Or maybe you want to build a table that lets you match title to name:
chars2 %>%
  select(name, title = titles) %>%
  unnest_longer(title)
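# In the same spirit (our own variation on the example above), you could
# build a table that matches each alias to a character name:
chars2 %>%
  select(name, alias = aliases) %>%
  unnest_longer(alias)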
# WEB SCRAPING
# Coding structure (a template -- fill in the URL and CSS selector):
# Link <- "<page URL>"
# page <- read_html(Link)
# AssignedName <- page %>%
#   html_nodes("<CSS selector>") %>%
#   html_text()
# AssignedName
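# A tiny self-contained demo of the same pattern (our addition; it uses
# rvest's minimal_html(), so no network access is needed):
demo_page <- minimal_html("<div><p class='greet'>hello</p><p class='greet'>world</p></div>")
demo_page %>%
  html_nodes(".greet") %>%
  html_text() # "hello" "world"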
# Example 1
# First, let's scrape all movie names from the site
Link <- "https://www.imdb.com/search/title/?genres=action&groups=top_250&sort=user_rating,desc"
page <- read_html(Link)
Movienames <- page %>%
  html_nodes(".lister-item-header a") %>%
  html_text()
Movienames
# Now, let's scrape the year each movie was initially released
Link <- "https://www.imdb.com/search/title/?genres=action&groups=top_250&sort=user_rating,desc"
page <- read_html(Link)
years <- page %>%
  html_nodes(".text-muted.unbold") %>%
  html_text()
years
# Finally, we scrape the movie ratings from the page
Link <- "https://www.imdb.com/search/title/?genres=action&groups=top_250&sort=user_rating,desc"
page <- read_html(Link)
Movieratings <- page %>%
  html_nodes(".ratings-imdb-rating strong") %>%
  html_text()
Movieratings
# Let's organize our collected data into a data frame.
moviesdataframe <- data.frame(Movienames, years, Movieratings)
moviesdataframe
# Now let's improve the format and appearance of the data table by
# transforming it into a tibble
is_tibble(moviesdataframe)
as_tibble(moviesdataframe)
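# A possible next step (our sketch; it assumes the three scraped vectors
# line up row by row and that the page structure hasn't changed):
# convert the scraped text to numbers and sort by rating.
as_tibble(moviesdataframe) %>%
  mutate(rating = parse_number(as.character(Movieratings)),
         year = parse_number(as.character(years))) %>%
  arrange(desc(rating))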
# Example 2
# Washington DC
# We first scrape daily temperatures from the page
Link <- "https://forecast.weather.gov/MapClick.php?CityName=Washington&state=DC&site=LWX&textField1=38.895&textField2=-77.0373&e=1#.YLPCyflKiUk"
page <- read_html(Link)
Temperatures <- page %>%
  html_nodes(".temp") %>%
  html_text()
Temperatures
# Let's only output the numbers
parse_number(Temperatures)
# Let's create plots and summaries for the collected data
parse_number(Temperatures) -> y
y
mean(y)
summary(y)
boxplot(y)
# We next scrape the daily forecasts
Link <- "https://forecast.weather.gov/MapClick.php?CityName=Washington&state=DC&site=LWX&textField1=38.895&textField2=-77.0373&e=1#.YLPCyflKiUk"
page <- read_html(Link)
Forecasts <- page %>%
  html_nodes(".short-desc") %>%
  html_text()
Forecasts
# Let's create a data frame
Weather <- data.frame(Temperatures, Forecasts)
Weather
# The data frame has a good structure/display. No need to change to
# a tibble.
# Suppose we decide to scrape more data. Let's incorporate the day/night designation
Link <- "https://forecast.weather.gov/MapClick.php?CityName=Washington&state=DC&site=LWX&textField1=38.895&textField2=-77.0373&e=1#.YLPCyflKiUk"
page <- read_html(Link)
DayNight <- page %>%
  html_nodes(".period-name") %>%
  html_text()
DayNight
# New Weather table
Weather <- data.frame(DayNight, Temperatures, Forecasts)
Weather
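# A short follow-up sketch (ours; it assumes the scraped vectors are the
# same length and aligned): average temperature for day vs. night periods.
Weather %>%
  mutate(temp = parse_number(as.character(Temperatures)),
         period = ifelse(grepl("night", DayNight, ignore.case = TRUE),
                         "Night", "Day")) %>%
  group_by(period) %>%
  summarise(mean_temp = mean(temp, na.rm = TRUE))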
# WEB SCRAPING: USING AN API
install.packages("tidycensus") # An R package used to retrieve information from
# the Census Bureau.
library(tidycensus)
library(tidyverse)
library(dplyr)
library(ggplot2)
# What is an API?
# API stands for Application Programming Interface, which is
# a software intermediary that allows two applications to talk to each
# other. Each time you use an app like Facebook, send an instant
# message, or check the weather on your phone, you're using an API.
# Informally, an API is code, software, an electronic system, that
# makes it possible for information or data to be retrieved and
# interpreted.
# An excellent post describing APIs in more detail is linked below:
# https://www.mulesoft.com/resources/api/what-is-an-api
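# As a concrete (commented-out) sketch of talking to an API directly --
# our illustration, using the httr and jsonlite packages, which the rest
# of this script does not need:
# library(httr)
# library(jsonlite)
# resp <- GET("https://api.census.gov/data/2010/dec/sf1",
#             query = list(get = "NAME,P013001", "for" = "state:*",
#                          key = "<your API key>"))
# fromJSON(content(resp, as = "text", encoding = "UTF-8"))
# # a character matrix whose first row holds the column names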
# In order to obtain the services of an organization's API, it is
# common to use what is called an API key. An API key is a code
# identifier that gives the user access.
# We will use an API key in the process of retrieving data from the
# US Census Bureau. We will use the R package tidycensus
# Use this link to get a census API Key
# http://api.census.gov/data/key_signup.html
# Below is my API key, along with the code needed to activate it.
census_api_key("2153faf1e1d25707ef71c8c464cb4c2c08be4e76", install = TRUE)
# From the US census data bank, we will get the median age by state
# for 2010.
# The following link gives information on the basic usage of tidycensus:
# https://walker-data.com/tidycensus/
# Use the following link to get variable codes and their descriptions.
# https://api.census.gov/data/2010/dec/sf1/variables.html
# Using Census data to get the median age by state in 2010
age10 <- get_decennial(geography = "state",
                       variables = "P013001",
                       year = 2010)
age10
# Now use R code to find the mean of the state median ages
mean(age10$value)
# Total females by state who were 17 years old in 2010
age17_10 <- get_decennial(geography = "state",
                          variables = "P014031",
                          year = 2010)
age17_10
# We can now use R coding to get summary numbers on the variable
mean(age17_10$value)
max(age17_10$value)
summary(age17_10$value)
# ACS Data
# The ACS (American Community Survey) is based on a sample of about
# 3 million households. It is not as comprehensive or complete as the
# standard US census.
# We will focus on the 5-year ACS data.
# Let's find median household income data from 2013 to 2017 for
# counties in the state of Vermont.
# First, we need to find the variable codes, so let's load the codebook for 2017.
v17 <- load_variables(2017, "acs5", cache = TRUE)
View(v17)
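# A handy alternative to scrolling through View() (our sketch; str_detect()
# comes from stringr, loaded with the tidyverse):
v17 %>%
  filter(str_detect(tolower(label), "median household income"))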
vt <- get_acs(geography = "county",
              variables = c(medincome = "B19013_001"),
              state = "VT",
              year = 2017)
vt
# moe (margin of error)
mean(vt$moe)
median(vt$estimate)
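# A quick sketch (ours) for putting the margin of error in context:
# express moe as a percentage of each county's estimate.
vt %>%
  mutate(moe_pct = 100 * moe / estimate) %>%
  arrange(desc(moe_pct))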
?gsub
vt %>%
  mutate(NAME = gsub(" County, Vermont", "", NAME)) %>%
  ggplot(aes(x = estimate, y = reorder(NAME, estimate))) +
  geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) +
  geom_point(color = "red", size = 3) +
  labs(title = "Household income by county in Vermont",
       subtitle = "2013-2017 American Community Survey",
       y = "",
       x = "ACS estimate (bars represent margin of error)")
# Let's generate a similar table and graph for Pennsylvania.
pa <- get_acs(geography = "county",
              variables = c(medincome = "B19013_001"),
              state = "PA",
              year = 2017)
pa
pa %>%
  mutate(NAME = gsub(" County, Pennsylvania", "", NAME)) %>%
  ggplot(aes(x = estimate, y = reorder(NAME, estimate))) +
  geom_errorbarh(aes(xmin = estimate - moe, xmax = estimate + moe)) +
  geom_point(color = "green", size = 3) +
  labs(title = "Household income by county in Pennsylvania",
       subtitle = "2013-2017 American Community Survey",
       y = "",
       x = "ACS estimate (bars represent margin of error)")
# Quit R and save the workspace
q(save = "yes")