# Load the hotel bookings data. The first row of the CSV holds the column
# names, and the column-type message is silenced.
df <- read_csv(
  "data/hotel_bookings.csv",
  col_names = TRUE,
  show_col_types = FALSE
)

# Suppress the informational message emitted by summarise().
options(dplyr.summarise.inform = FALSE)
7 酒店房间预定预测
7.1 读取数据
7.2 数据总览
# Print a column-by-column overview (name, type, first values) of the raw data.
glimpse(df)
Rows: 119,390
Columns: 32
$ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
$ arrival_date_month <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent <chr> "NULL", "NULL", "NULL", "304", "240", "…
$ company <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type <chr> "Transient", "Transient", "Transient", …
$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
7.2.1 数据清洗
在快速浏览数据集、初步了解各变量及其格式之后,下一步是检查每列中缺失值的数量及其占比。
在此之前,先将 "NULL" 转换为 NA,并将 "CN" 转换为 "CHN"。
# 1. Temporarily convert the date column to character so that the "NULL"
#    markers can be replaced across the whole data frame in one step.
dftemp1 <- transform(
  df,
  reservation_status_date = as.character(reservation_status_date)
)

# 2. Turn every "NULL" string into NA and recode "CN" as "CHN".
dftemp1[dftemp1 == "NULL"] <- NA
dftemp1[dftemp1 == "CN"] <- "CHN"

# 3. Besides the iso3c code, add the country name, continent and region.
dftemp1$country_name <- countrycode(
  dftemp1$country, "iso3c", "country.name"
)
dftemp1$continent <- countrycode(
  dftemp1$country, "iso3c", "continent"
)
dftemp1$region <- countrycode(
  dftemp1$country, "iso3c", "region23"
)

# 4. Convert the date column back to Date.
dftemp2 <- transform(
  dftemp1,
  reservation_status_date = as.Date(reservation_status_date)
)

# 5. Count the missing values per column, keeping only columns that have any.
miss_var_summary(dftemp2) %>%
  filter(n_miss != 0)
# A tibble: 7 × 3
variable n_miss pct_miss
<chr> <int> <dbl>
1 company 112593 94.3
2 agent 16340 13.7
3 continent 495 0.415
4 region 495 0.415
5 country_name 491 0.411
6 country 488 0.409
7 children 4 0.00335
我们发现,company 列中有 112593 个缺失值,约占该列数据的 94%,单纯地将这些缺失值删除显然不太合适。我们将 country、country_name、continent、agent 和 company 等列中的 NA 替换为 None,将 children 中的 NA 替换为 0。
7.2.2 缺失值处理
# 1. Replace every NA with 0. In character columns the filled value is stored
#    as the string "0".
dftemp2[is.na(dftemp2)] <- 0

# 2. In the country, agent, company and country_name columns, relabel the
#    "0" placeholder as "None".
dftemp3 <- dftemp2 %>%
  mutate(
    country = replace(country, country == "0", "None"),
    agent = replace(agent, agent == "0", "None"),
    company = replace(company, company == "0", "None"),
    country_name = replace(country_name, country_name == "0", "None")
  )

glimpse(dftemp3)
Rows: 119,390
Columns: 35
$ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
$ arrival_date_month <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent <chr> "None", "None", "None", "304", "240", "…
$ company <chr> "None", "None", "None", "None", "None",…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type <chr> "Transient", "Transient", "Transient", …
$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
$ country_name <chr> "Portugal", "Portugal", "United Kingdom…
$ continent <chr> "Europe", "Europe", "Europe", "Europe",…
$ region <chr> "Southern Europe", "Southern Europe", "…
7.2.3 数据整合
增加四个新列:
- total_nights:总住宿天数
- total_rates:总房价
- total_guests:总房客
- total_nights_rates_guests
# Add the derived columns:
#   total_nights              weekend nights + week nights
#   total_rates               total_nights * adr
#   total_guests              adults + children + babies
#   total_nights_rates_guests sum of the three totals; a value of 0 means the
#                             row has no nights, no rate and no guests
# NOTE(review): this starts again from dftemp2 and overwrites dftemp3, so the
# "None" recoding applied to dftemp3 earlier is not carried forward (agent and
# company show "0" in the later glimpse of dfc). Confirm whether the input
# should be dftemp3 instead.
dftemp3 <- dftemp2 %>%
  mutate(
    total_nights = stays_in_weekend_nights + stays_in_week_nights,
    total_rates = total_nights * adr,
    total_guests = adults + children + babies,
    total_nights_rates_guests = total_nights + total_guests + total_rates
  )
- 增加 arrival_monty_year 列(到店年月,取当月 1 日),方便可视化。
# Build the arrival month as a Date (first day of the arrival month).
# month_year_temp is a helper string such as "1 July 2015" that dmy() parses.
# The earlier intermediate make_date(month_year_temp) call was dropped: it was
# handed a character string and its result was immediately overwritten by the
# dmy() value, so the final columns are unchanged.
dftemp4 <- dftemp3 %>%
  mutate(
    arrival_date_year = as.character(arrival_date_year),
    month_year_temp = paste("1", arrival_date_month, arrival_date_year),
    arrival_monty_year = dmy(month_year_temp)
  )
7.2.4 数据完整性检查
计算没有夜晚、房价或客人记录的观测值数量.
# Count the rows whose combined nights + rates + guests total is zero.
dftemp4 %>%
  filter(total_nights_rates_guests == 0) %>%
  count()
n
1 70
下一步是过滤掉没有夜晚、房价和客人记录的 70 行,并删除 total_nights_rates_guests 变量,从而最终确定用于分析的数据集。
7.2.5 数据定稿
# Final analysis data set: drop the rows with no nights, rate or guests, then
# remove the two helper columns that are no longer needed.
dfc <- dftemp4 %>%
  filter(total_nights_rates_guests != 0) %>%
  select(-total_nights_rates_guests, -month_year_temp)

glimpse(dfc)
Rows: 119,320
Columns: 39
$ hotel <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year <chr> "2015", "2015", "2015", "2015", "2015",…
$ arrival_date_month <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent <chr> "0", "0", "0", "304", "240", "240", "0"…
$ company <chr> "0", "0", "0", "0", "0", "0", "0", "0",…
$ days_in_waiting_list <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type <chr> "Transient", "Transient", "Transient", …
$ adr <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
$ country_name <chr> "Portugal", "Portugal", "United Kingdom…
$ continent <chr> "Europe", "Europe", "Europe", "Europe",…
$ region <chr> "Southern Europe", "Southern Europe", "…
$ total_nights <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ total_rates <dbl> 0.00, 0.00, 75.00, 75.00, 196.00, 196.0…
$ total_guests <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ arrival_monty_year <date> 2015-07-01, 2015-07-01, 2015-07-01, 20…
7.3 数据初步探索
# Descriptive statistics for dfc, rounded to two decimals and transposed so
# that each variable is printed as a row.
descr(dfc) %>%
round(., 2) %>%
t()
Descriptive Statistics
dfc
N: 119320
Mean Std.Dev Min Q1 Median Q3
------------------------------------ -------- --------- -------- -------- -------- --------
adr 101.89 50.49 -6.38 69.36 94.80 126.00
adults 1.86 0.58 0.00 2.00 2.00 2.00
arrival_date_day_of_month 15.80 8.78 1.00 8.00 16.00 23.00
arrival_date_week_number 27.16 13.60 1.00 16.00 28.00 38.00
babies 0.01 0.10 0.00 0.00 0.00 0.00
booking_changes 0.22 0.65 0.00 0.00 0.00 0.00
children 0.10 0.40 0.00 0.00 0.00 0.00
days_in_waiting_list 2.32 17.60 0.00 0.00 0.00 0.00
is_canceled 0.37 0.48 0.00 0.00 0.00 1.00
is_repeated_guest 0.03 0.17 0.00 0.00 0.00 0.00
lead_time 104.07 106.87 0.00 18.00 69.00 160.00
previous_bookings_not_canceled 0.14 1.50 0.00 0.00 0.00 0.00
previous_cancellations 0.09 0.84 0.00 0.00 0.00 0.00
required_car_parking_spaces 0.06 0.25 0.00 0.00 0.00 0.00
stays_in_week_nights 2.50 1.91 0.00 1.00 2.00 3.00
stays_in_weekend_nights 0.93 1.00 0.00 0.00 1.00 2.00
total_guests 1.97 0.72 0.00 2.00 2.00 2.00
total_nights 3.43 2.56 0.00 2.00 3.00 4.00
total_of_special_requests 0.57 0.79 0.00 0.00 0.00 1.00
total_rates 358.06 335.90 -63.80 146.00 267.00 446.40
Table: Table continues below
Max MAD IQR CV Skewness SE.Skewness
------------------------------------ --------- -------- -------- ------- ---------- -------------
adr 5400.00 41.22 56.64 0.50 10.57 0.01
adults 55.00 0.00 0.00 0.31 18.49 0.01
arrival_date_day_of_month 31.00 11.86 15.00 0.56 0.00 0.01
arrival_date_week_number 53.00 16.31 22.00 0.50 -0.01 0.01
babies 10.00 0.00 0.00 12.25 24.64 0.01
booking_changes 21.00 0.00 0.00 2.95 6.00 0.01
children 10.00 0.00 0.00 3.84 4.11 0.01
days_in_waiting_list 391.00 0.00 0.00 7.58 11.94 0.01
is_canceled 1.00 0.00 1.00 1.30 0.54 0.01
is_repeated_guest 1.00 0.00 0.00 5.54 5.36 0.01
lead_time 737.00 88.96 142.00 1.03 1.35 0.01
previous_bookings_not_canceled 72.00 0.00 0.00 10.93 23.54 0.01
previous_cancellations 26.00 0.00 0.00 9.69 24.45 0.01
required_car_parking_spaces 8.00 0.00 0.00 3.92 4.16 0.01
stays_in_week_nights 50.00 1.48 2.00 0.76 2.86 0.01
stays_in_weekend_nights 19.00 1.48 2.00 1.08 1.38 0.01
total_guests 55.00 0.00 0.00 0.37 10.22 0.01
total_nights 69.00 1.48 2.00 0.75 3.31 0.01
total_of_special_requests 5.00 0.00 1.00 1.39 1.35 0.01
total_rates 7590.00 206.82 300.40 0.94 2.98 0.01
Table: Table continues below
Kurtosis N.Valid Pct.Valid
------------------------------------ ---------- ----------- -----------
adr 1017.33 119320.00 100.00
adults 1367.51 119320.00 100.00
arrival_date_day_of_month -1.19 119320.00 100.00
arrival_date_week_number -0.99 119320.00 100.00
babies 1632.90 119320.00 100.00
booking_changes 79.39 119320.00 100.00
children 18.66 119320.00 100.00
days_in_waiting_list 186.67 119320.00 100.00
is_canceled -1.71 119320.00 100.00
is_repeated_guest 26.76 119320.00 100.00
lead_time 1.70 119320.00 100.00
previous_bookings_not_canceled 767.15 119320.00 100.00
previous_cancellations 673.76 119320.00 100.00
required_car_parking_spaces 29.98 119320.00 100.00
stays_in_week_nights 24.31 119320.00 100.00
stays_in_weekend_nights 7.17 119320.00 100.00
total_guests 559.08 119320.00 100.00
total_nights 28.91 119320.00 100.00
total_of_special_requests 1.49 119320.00 100.00
total_rates 17.15 119320.00 100.00
7.3.1 按大洲划分的预订量和平均每晚花费
# 1. Totals per hotel type and continent. average_rate is the summed
#    total_rates divided by the number of bookings. Rows whose continent is
#    the "0" placeholder (missing continent) are dropped.
dfccharts <- dfc %>%
  group_by(hotel, continent) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count) %>%
  filter(continent != "0")

# 2. The same totals per continent only.
dfcchartscontonly <- dfc %>%
  group_by(continent) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count) %>%
  filter(continent != "0")
# 3. Shared ggplot theme for the charts: legend on top and left-justified
#    without a title, no axis titles, small axis tick labels.
Theme1 <- theme(
  legend.position = "top",
  legend.title = element_blank(),
  plot.title = element_text(size = 15),
  axis.title = element_blank(),
  axis.text.x = element_text(size = 5),
  axis.text.y = element_text(size = 5),
  legend.text = element_text(size = 10),
  legend.spacing.x = unit(4, "mm"),
  legend.justification = "left"
)
# 4. Charts.

# Bookings per continent, in thousands. dfccharts has one row per hotel type,
# so geom_col() stacks the two hotel types into a single bar per continent.
plot1a <- ggplot(
  dfccharts,
  aes(
    x = reorder(continent, -bookings_count),
    y = bookings_count
  )
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(suffix = "K", scale = 0.001)
  ) +
  labs(title = "各大洲预定总数") +
  Theme1

# Bookings per continent, split by hotel type.
plot2a <- ggplot(
  dfccharts,
  aes(
    x = reorder(continent, -bookings_count),
    y = bookings_count,
    fill = hotel
  )
) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  scale_y_continuous(
    labels = unit_format(suffix = "K", scale = 0.001)
  ) +
  labs(title = "按照城市酒店和度假酒店细分") +
  Theme1

# Average rate per continent. The label argument was misspelled "perfix",
# which silently dropped the "$" prefix; it is now "prefix".
plot3a <- ggplot(
  dfcchartscontonly,
  aes(
    x = reorder(continent, -average_rate),
    y = average_rate
  )
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "各大洲每天预定酒店平均消费") +
  Theme1

# Average rate per continent, split by hotel type.
plot4a <- ggplot(
  dfccharts,
  aes(
    x = continent,
    y = average_rate,
    fill = hotel
  )
) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  labs(title = "按城市酒店和度假酒店细分") +
  Theme1

# Arrange the four charts in a 2 x 2 grid.
(plot1a | plot3a) / (plot2a | plot4a)
欧洲似乎是迄今为止最受欢迎的目的地,并且强烈偏爱城市酒店与度假村。
非洲是最昂贵的大陆,平均每晚住宿费用超过600美元。
除非洲外,度假酒店的平均花费更高。而美洲城市和度假酒店之间的平均房价似乎相似。
下面,我们深入到各个区域,以了解花费的具体不同。
7.3.2 欧洲
- 筛选欧洲大陆的数据-by region & hotel
# European bookings summarised per hotel type and region.
dfchotelregions <- dfc %>%
  filter(continent == "Europe") %>%
  group_by(hotel, region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count)

head(dfchotelregions)
# A tibble: 6 × 5
# Groups: hotel [2]
hotel region total_rate bookings_count average_rate
<chr> <chr> <dbl> <int> <dbl>
1 City Hotel Eastern Europe 745533. 1866 400.
2 City Hotel Northern Europe 3290622. 8565 384.
3 City Hotel Southern Europe 10209285. 39229 260.
4 City Hotel Western Europe 7750375. 20929 370.
5 Resort Hotel Eastern Europe 492552. 820 601.
6 Resort Hotel Northern Europe 4945907. 9745 508.
- 筛选欧洲大陆的数据-by region
# European bookings summarised per region only.
dfcregiononly <- dfc %>%
  filter(continent == "Europe") %>%
  group_by(region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count)

# NOTE(review): this previews dfchotelregions again, not the dfcregiononly
# table built just above; it looks like a copy-paste slip. Confirm whether
# head(dfcregiononly) was intended.
head(dfchotelregions)
# A tibble: 6 × 5
# Groups: hotel [2]
hotel region total_rate bookings_count average_rate
<chr> <chr> <dbl> <int> <dbl>
1 City Hotel Eastern Europe 745533. 1866 400.
2 City Hotel Northern Europe 3290622. 8565 384.
3 City Hotel Southern Europe 10209285. 39229 260.
4 City Hotel Western Europe 7750375. 20929 370.
5 Resort Hotel Eastern Europe 492552. 820 601.
6 Resort Hotel Northern Europe 4945907. 9745 508.
- 作图
# Bookings per European region.
plot1b <- ggplot(
  dfcregiononly,
  aes(
    reorder(region, -bookings_count),
    bookings_count
  )
) +
  geom_col(fill = "steelblue") +
  labs(title = "欧洲预定数量") +
  Theme1

# Average rate per European region.
plot2b <- ggplot(
  dfcregiononly,
  aes(
    reorder(region, -average_rate),
    average_rate
  )
) +
  geom_col(fill = "steelblue") +
  labs(title = "欧洲平均花销") +
  Theme1

# Bookings per European region, split by hotel type. The y axis is a booking
# count, so the misspelled `perfix = "$"` argument (which had no effect) is
# removed rather than corrected; the label formatting stays as it was.
plot3b <- ggplot(
  dfchotelregions,
  aes(
    reorder(region, -bookings_count),
    bookings_count,
    fill = hotel
  )
) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  scale_y_continuous(
    labels = unit_format(suffix = "")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Average rate per European region, split by hotel type.
plot4b <- ggplot(
  dfchotelregions,
  aes(
    reorder(region, -average_rate),
    average_rate,
    fill = hotel
  )
) +
  geom_bar(position = "dodge", stat = "identity") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Arrange the four charts in a 2 x 2 grid.
(plot1b | plot2b) / (plot3b | plot4b)
7.3.3 美洲
# Bookings in the Americas summarised per hotel type and region.
ame_hotel_regions <- dfc %>%
  filter(continent == "Americas") %>%
  group_by(hotel, region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count)

# Bookings in the Americas summarised per region only.
ame_region <- dfc %>%
  filter(continent == "Americas") %>%
  group_by(region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count)
# Bookings per region of the Americas.
plot1c <- ggplot(
  ame_region,
  aes(
    reorder(region, -bookings_count),
    bookings_count
  )
) +
  geom_col(fill = "steelblue") +
  labs(title = "美洲预定数量") +
  Theme1

# Average rate per region of the Americas.
plot2c <- ggplot(
  ame_region,
  aes(
    reorder(region, -average_rate),
    average_rate
  )
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "美洲平均花销") +
  Theme1

# Bookings per region of the Americas, split by hotel type.
plot3c <- ggplot(
  ame_hotel_regions,
  aes(
    reorder(region, -bookings_count),
    bookings_count,
    fill = hotel
  )
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Average rate per region of the Americas, split by hotel type.
# The title previously read "独家" (a typo for "度假", resort); it now matches
# the titles of the other hotel-type charts.
plot4c <- ggplot(
  ame_hotel_regions,
  aes(
    reorder(region, -average_rate),
    average_rate,
    fill = hotel
  )
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c(
      "City Hotel" = "steelblue3",
      "Resort Hotel" = "steelblue4"
    )
  ) +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Arrange the four charts in a 2 x 2 grid.
(plot1c | plot2c) / (plot3c | plot4c)
7.3.4 亚洲
7.3.5 大洋洲
7.3.6 非洲
7.4 不同国家预定数量分布
# 1. Summarise city-hotel bookings per country.
#    bookings_count is the natural log of the number of bookings, rounded to
#    one decimal. average_rate is computed inside summarize() so that n() is
#    the number of bookings of the country; in a mutate() after summarize(),
#    n() would be the number of countries instead.
dfcCity <- dfc %>%
  filter(hotel == "City Hotel") %>%
  group_by(country) %>%
  summarize(
    total_rate = sum(total_rates),
    bookings_count = round(log(n()), 1),
    average_rate = total_rate / n()
  )

# 2. Summarise resort-hotel bookings per country in the same way.
dfcResort <- dfc %>%
  filter(hotel == "Resort Hotel") %>%
  group_by(country) %>%
  summarize(
    total_rate = sum(total_rates),
    bookings_count = round(log(n()), 1),
    average_rate = total_rate / n()
  )
#3. Plots
# 3. World maps of the (log-scaled) booking volume per country; the data is
#    joined to the map on the ISO3 country code.
MapCity <- highchart() %>%
  hc_add_series_map(
    worldgeojson, dfcCity,
    value = "bookings_count",
    joinBy = c("iso3", "country")
  ) %>%
  hc_colorAxis(
    minColor = "#fde725",
    maxColor = "#0d0887"
  ) %>%
  hc_legend(
    align = "top", verticalAlign = "top",
    layout = "horizontal", x = -8, y = 41
  ) %>%
  hc_title(
    text = "City Hotel Bookings Volume by Country",
    align = "left"
  ) %>%
  hc_subtitle(text = "Scale in Natural Log", align = "left")

# NOTE(review): unlike MapCity, no hc_colorAxis() is set for this map;
# confirm whether the same colour scale was intended.
MapResort <- highchart() %>%
  hc_add_series_map(
    worldgeojson, dfcResort,
    value = "bookings_count",
    joinBy = c("iso3", "country")
  ) %>%
  hc_legend(
    align = "top", verticalAlign = "top",
    layout = "horizontal", x = -8, y = 41
  ) %>%
  hc_title(
    text = "Resort Hotel Bookings Volume by Country",
    align = "left"
  ) %>%
  hc_subtitle(text = "Scale in Natural Log", align = "left")

# 4. Arrange the two maps in a grid.
hw_grid(MapCity, MapResort)