7  酒店房间预订预测

7.1 读取数据

# Silence dplyr's informational messages from summarise()
options(dplyr.summarise.inform = FALSE)

# Load the hotel bookings data; the first row supplies the column names and
# the column-type message is not shown.
df <- read_csv(
  file = "data/hotel_bookings.csv",
  col_names = TRUE,
  show_col_types = FALSE
)

7.2 数据总览

# Print a compact overview of the raw data: one line per column with its
# type and first values.
glimpse(df)
Rows: 119,390
Columns: 32
$ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
$ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent                          <chr> "NULL", "NULL", "NULL", "304", "240", "…
$ company                        <chr> "NULL", "NULL", "NULL", "NULL", "NULL",…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <chr> "Transient", "Transient", "Transient", …
$ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…

7.2.1 数据清洗

在快速浏览数据集、初步了解各变量及其格式之后,下一步是检查每列中缺失值的数量及其所占比例。

将 "NULL" 转换为 NA,将 "CN" 转换为 "CHN"

# 1. Temporarily convert the date column to character so that the string
#    comparisons below can be applied to the whole data frame
dftemp1 <- transform(
  df,
  reservation_status_date = 
    as.character(reservation_status_date))

# 2. Turn the literal string "NULL" into NA and recode "CN" as "CHN"
#    (both replacements are applied to every column, not only `country`)
dftemp1[dftemp1 == "NULL"] <- NA
dftemp1[dftemp1 == "CN"] <- "CHN"

# 3. Besides the iso3c code, add country name, continent and region,
#    each looked up from the iso3c code in `country` with countrycode()
dftemp1$country_name <- countrycode(
  dftemp1$country, "iso3c", "country.name"
)
dftemp1$continent <- countrycode(
  dftemp1$country, "iso3c", "continent"
)
dftemp1$region <- countrycode(
  dftemp1$country, "iso3c", "region23"
)

# 4. Convert the date column back to Date
dftemp2 <- 
  transform(dftemp1,
            reservation_status_date =
              as.Date(reservation_status_date))

# 5. Summarise missing values per variable, keeping only the variables
#    that have at least one missing value
miss_var_summary(dftemp2) %>% 
  filter(n_miss != 0)
# A tibble: 7 × 3
  variable     n_miss pct_miss
  <chr>         <int>    <dbl>
1 company      112593 94.3    
2 agent         16340 13.7    
3 continent       495  0.415  
4 region          495  0.415  
5 country_name    491  0.411  
6 country         488  0.409  
7 children          4  0.00335
  • 我们发现,company列中有112593个缺失值,约占该列全部数据的94%,单纯地删除这些含缺失值的记录显然不太合适。

  • 我们将country、country_name、agent和company中的NA替换为"None",将children中的NA替换为0;continent和region中的NA则替换为"0",在后续分析中再过滤掉。

7.2.2 缺失值处理

# 1. Fill every remaining NA in the data frame with 0
#    (in character columns the filler ends up as the string "0")
dftemp2[is.na(dftemp2)] <- 0

# 2. In country, agent, company and country_name, turn that "0" filler
#    into the label "None"
dftemp3 <- dftemp2 %>%
  mutate(
    across(
      c(country, agent, company, country_name),
      ~ replace(.x, .x == "0", "None")
    )
  )
glimpse(dftemp3)
Rows: 119,390
Columns: 35
$ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year              <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 201…
$ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent                          <chr> "None", "None", "None", "304", "240", "…
$ company                        <chr> "None", "None", "None", "None", "None",…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <chr> "Transient", "Transient", "Transient", …
$ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
$ country_name                   <chr> "Portugal", "Portugal", "United Kingdom…
$ continent                      <chr> "Europe", "Europe", "Europe", "Europe",…
$ region                         <chr> "Southern Europe", "Southern Europe", "…

7.2.3 数据整合

  1. 增加四个新列:

    • total_nights:总住宿天数
    • total_rates:总房价
    • total_guests:总房客
    • total_nights_rates_guests:以上三项之和,用于后面检查没有任何记录的预订
# Derive per-booking totals.
# The chain starts from dftemp3 (not dftemp2) so that the "None" labels
# assigned to country / agent / company / country_name are kept; starting
# from dftemp2 silently reverted them to "0".
dftemp3 <- dftemp3 %>%
  mutate(
    # total length of stay in nights
    total_nights = stays_in_weekend_nights + stays_in_week_nights,
    # total room charge: nights times average daily rate
    total_rates = total_nights * adr,
    # everyone on the booking
    total_guests = adults + children + babies,
    # sum of the three totals; used afterwards to find empty bookings
    total_nights_rates_guests = total_nights + total_guests + total_rates
  )
  1. 增加arrival_month_year列(代码中的列名为arrival_monty_year),即到店月份的第一天,方便按月可视化。
# Add a Date column holding the first day of the arrival month.
# The previous make_date(month_year_temp) step was removed: make_date()
# expects numeric year/month/day, not a string such as "1 July 2015", and
# its result was overwritten by dmy() immediately afterwards.
# The column name `arrival_monty_year` (sic) is kept as-is so any later
# reference to it keeps working.
# NOTE(review): dmy() parses English month names according to the session
# locale - confirm it is run under an English/C LC_TIME locale.
dftemp4 <- dftemp3 %>%
  mutate(
    arrival_date_year = as.character(arrival_date_year),
    month_year_temp = paste("1", arrival_date_month, arrival_date_year),
    arrival_monty_year = dmy(month_year_temp)
  )

7.2.4 数据完整性检查

计算住宿天数、房价和客人数之和为 0(即三者均无记录)的观测值数量。

# Number of bookings whose combined nights + rates + guests total is zero
count(filter(dftemp4, total_nights_rates_guests == 0))
   n
1 70

下一步是过滤掉这 70 行没有夜晚、房价和客人记录的观测值,并删除total_nights_rates_guests变量,从而得到最终用于分析的数据集。

7.2.5 数据定稿

# Final analysis data set: drop the empty bookings, then discard the two
# helper columns that were only needed during preparation.
dfc <- dftemp4 %>%
  filter(total_nights_rates_guests != 0) %>%
  select(-c(total_nights_rates_guests, month_year_temp))
glimpse(dfc)
Rows: 119,320
Columns: 39
$ hotel                          <chr> "Resort Hotel", "Resort Hotel", "Resort…
$ is_canceled                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
$ lead_time                      <dbl> 342, 737, 7, 13, 14, 14, 0, 9, 85, 75, …
$ arrival_date_year              <chr> "2015", "2015", "2015", "2015", "2015",…
$ arrival_date_month             <chr> "July", "July", "July", "July", "July",…
$ arrival_date_week_number       <dbl> 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,…
$ arrival_date_day_of_month      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ stays_in_weekend_nights        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ stays_in_week_nights           <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ adults                         <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ children                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ babies                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ meal                           <chr> "BB", "BB", "BB", "BB", "BB", "BB", "BB…
$ country                        <chr> "PRT", "PRT", "GBR", "GBR", "GBR", "GBR…
$ market_segment                 <chr> "Direct", "Direct", "Direct", "Corporat…
$ distribution_channel           <chr> "Direct", "Direct", "Direct", "Corporat…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <chr> "C", "C", "A", "A", "A", "A", "C", "C",…
$ assigned_room_type             <chr> "C", "C", "C", "A", "A", "A", "C", "C",…
$ booking_changes                <dbl> 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <chr> "No Deposit", "No Deposit", "No Deposit…
$ agent                          <chr> "0", "0", "0", "304", "240", "240", "0"…
$ company                        <chr> "0", "0", "0", "0", "0", "0", "0", "0",…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <chr> "Transient", "Transient", "Transient", …
$ adr                            <dbl> 0.00, 0.00, 75.00, 75.00, 98.00, 98.00,…
$ required_car_parking_spaces    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ total_of_special_requests      <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 3, …
$ reservation_status             <chr> "Check-Out", "Check-Out", "Check-Out", …
$ reservation_status_date        <date> 2015-07-01, 2015-07-01, 2015-07-02, 20…
$ country_name                   <chr> "Portugal", "Portugal", "United Kingdom…
$ continent                      <chr> "Europe", "Europe", "Europe", "Europe",…
$ region                         <chr> "Southern Europe", "Southern Europe", "…
$ total_nights                   <dbl> 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, …
$ total_rates                    <dbl> 0.00, 0.00, 75.00, 75.00, 196.00, 196.0…
$ total_guests                   <dbl> 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ arrival_monty_year             <date> 2015-07-01, 2015-07-01, 2015-07-01, 20…

7.3 数据初步探索

# Descriptive statistics of dfc from descr(), rounded to two decimals and
# then transposed with t().
descr(dfc) %>% 
  round(., 2) %>% 
  t()
Descriptive Statistics  
dfc  
N: 119320  

                                         Mean   Std.Dev      Min       Q1   Median       Q3
------------------------------------ -------- --------- -------- -------- -------- --------
                                 adr   101.89     50.49    -6.38    69.36    94.80   126.00
                              adults     1.86      0.58     0.00     2.00     2.00     2.00
           arrival_date_day_of_month    15.80      8.78     1.00     8.00    16.00    23.00
            arrival_date_week_number    27.16     13.60     1.00    16.00    28.00    38.00
                              babies     0.01      0.10     0.00     0.00     0.00     0.00
                     booking_changes     0.22      0.65     0.00     0.00     0.00     0.00
                            children     0.10      0.40     0.00     0.00     0.00     0.00
                days_in_waiting_list     2.32     17.60     0.00     0.00     0.00     0.00
                         is_canceled     0.37      0.48     0.00     0.00     0.00     1.00
                   is_repeated_guest     0.03      0.17     0.00     0.00     0.00     0.00
                           lead_time   104.07    106.87     0.00    18.00    69.00   160.00
      previous_bookings_not_canceled     0.14      1.50     0.00     0.00     0.00     0.00
              previous_cancellations     0.09      0.84     0.00     0.00     0.00     0.00
         required_car_parking_spaces     0.06      0.25     0.00     0.00     0.00     0.00
                stays_in_week_nights     2.50      1.91     0.00     1.00     2.00     3.00
             stays_in_weekend_nights     0.93      1.00     0.00     0.00     1.00     2.00
                        total_guests     1.97      0.72     0.00     2.00     2.00     2.00
                        total_nights     3.43      2.56     0.00     2.00     3.00     4.00
           total_of_special_requests     0.57      0.79     0.00     0.00     0.00     1.00
                         total_rates   358.06    335.90   -63.80   146.00   267.00   446.40

Table: Table continues below

 

                                           Max      MAD      IQR      CV   Skewness   SE.Skewness
------------------------------------ --------- -------- -------- ------- ---------- -------------
                                 adr   5400.00    41.22    56.64    0.50      10.57          0.01
                              adults     55.00     0.00     0.00    0.31      18.49          0.01
           arrival_date_day_of_month     31.00    11.86    15.00    0.56       0.00          0.01
            arrival_date_week_number     53.00    16.31    22.00    0.50      -0.01          0.01
                              babies     10.00     0.00     0.00   12.25      24.64          0.01
                     booking_changes     21.00     0.00     0.00    2.95       6.00          0.01
                            children     10.00     0.00     0.00    3.84       4.11          0.01
                days_in_waiting_list    391.00     0.00     0.00    7.58      11.94          0.01
                         is_canceled      1.00     0.00     1.00    1.30       0.54          0.01
                   is_repeated_guest      1.00     0.00     0.00    5.54       5.36          0.01
                           lead_time    737.00    88.96   142.00    1.03       1.35          0.01
      previous_bookings_not_canceled     72.00     0.00     0.00   10.93      23.54          0.01
              previous_cancellations     26.00     0.00     0.00    9.69      24.45          0.01
         required_car_parking_spaces      8.00     0.00     0.00    3.92       4.16          0.01
                stays_in_week_nights     50.00     1.48     2.00    0.76       2.86          0.01
             stays_in_weekend_nights     19.00     1.48     2.00    1.08       1.38          0.01
                        total_guests     55.00     0.00     0.00    0.37      10.22          0.01
                        total_nights     69.00     1.48     2.00    0.75       3.31          0.01
           total_of_special_requests      5.00     0.00     1.00    1.39       1.35          0.01
                         total_rates   7590.00   206.82   300.40    0.94       2.98          0.01

Table: Table continues below

 

                                       Kurtosis     N.Valid   Pct.Valid
------------------------------------ ---------- ----------- -----------
                                 adr    1017.33   119320.00      100.00
                              adults    1367.51   119320.00      100.00
           arrival_date_day_of_month      -1.19   119320.00      100.00
            arrival_date_week_number      -0.99   119320.00      100.00
                              babies    1632.90   119320.00      100.00
                     booking_changes      79.39   119320.00      100.00
                            children      18.66   119320.00      100.00
                days_in_waiting_list     186.67   119320.00      100.00
                         is_canceled      -1.71   119320.00      100.00
                   is_repeated_guest      26.76   119320.00      100.00
                           lead_time       1.70   119320.00      100.00
      previous_bookings_not_canceled     767.15   119320.00      100.00
              previous_cancellations     673.76   119320.00      100.00
         required_car_parking_spaces      29.98   119320.00      100.00
                stays_in_week_nights      24.31   119320.00      100.00
             stays_in_weekend_nights       7.17   119320.00      100.00
                        total_guests     559.08   119320.00      100.00
                        total_nights      28.91   119320.00      100.00
           total_of_special_requests       1.49   119320.00      100.00
                         total_rates      17.15   119320.00      100.00

7.3.1 按大洲划分的预订量和平均每晚花费

# 1. Totals per hotel type and continent: summed total_rates, number of
#    bookings, and their ratio. Rows whose continent is the "0" filler are
#    dropped.
dfccharts <- dfc %>%
  group_by(hotel, continent) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n(),
    average_rate = total_rate / bookings_count
  ) %>%
  filter(continent != "0")

# 2. The same figures per continent only
dfcchartscontonly <- dfc %>%
  group_by(continent) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n(),
    average_rate = total_rate / bookings_count
  ) %>%
  filter(continent != "0")

# 3. Shared ggplot2 theme, reused by the plots below
Theme1 <- theme(
  # titles
  plot.title = element_text(size = 15),
  axis.title = element_blank(),
  # axis tick labels
  axis.text.x = element_text(size = 5),
  axis.text.y = element_text(size = 5),
  # legend
  legend.position = "top",
  legend.justification = "left",
  legend.title = element_blank(),
  legend.text = element_text(size = 10),
  legend.spacing.x = unit(4, "mm")
)

# 4. Plots

# Fill colours shared by the two hotel-split charts
hotel_fill <- c("City Hotel" = "steelblue3",
                "Resort Hotel" = "steelblue4")

# Total bookings per continent. Uses the continent-level summary so each bar
# is a single value and bars are ordered by the continent total (the
# hotel-split table has two rows per continent).
plot1a <- ggplot(
  dfcchartscontonly,
  aes(x = reorder(continent, -bookings_count),
      y = bookings_count)
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(suffix = "K", scale = 0.001)
  ) +
  labs(title = "各大洲预定总数") +
  Theme1

# Bookings per continent, split by hotel type
plot2a <- ggplot(
  dfccharts,
  aes(x = reorder(continent, -bookings_count),
      y = bookings_count,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = hotel_fill) +
  scale_y_continuous(
    labels = unit_format(suffix = "K", scale = 0.001)
  ) +
  labs(title = "按照城市酒店和度假酒店细分") +
  Theme1

# Average spend per booking by continent.
# Fix: the argument was misspelled `perfix`, so the "$" prefix was never
# applied to the axis labels.
plot3a <- ggplot(
  dfcchartscontonly,
  aes(x = reorder(continent, -average_rate),
      y = average_rate)
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "各大洲每天预定酒店平均消费") +
  Theme1

# Average spend per booking by continent, split by hotel type.
# Dollar labels added so the axis matches the other average-rate charts.
plot4a <- ggplot(
  dfccharts,
  aes(x = continent,
      y = average_rate,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = hotel_fill) +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "按城市酒店和度假酒店细分") +
  Theme1

# 2 x 2 layout: totals on the top row, hotel split on the bottom row
(plot1a | plot3a) / (plot2a | plot4a)

  • 欧洲的预订量遥遥领先,而且相比度假酒店,欧洲游客明显更偏爱城市酒店。

  • 非洲是平均花费最高的大洲,平均每笔预订的住宿花费超过600美元(此处的平均值为总房价除以预订数,而非每晚房价)。

  • 除非洲外,度假酒店的平均花费更高。而美洲城市和度假酒店之间的平均房价似乎相似。

下面,我们深入到各个区域,以了解花费的具体不同。

7.3.2 欧洲

  1. 筛选欧洲大陆的数据-by region & hotel
# European bookings summarised per hotel type and region: summed
# total_rates, number of bookings, and their ratio.
dfchotelregions <- dfc %>%
  filter(continent == "Europe") %>%
  group_by(hotel, region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n(),
    average_rate = total_rate / bookings_count
  )
head(dfchotelregions)
# A tibble: 6 × 5
# Groups:   hotel [2]
  hotel        region          total_rate bookings_count average_rate
  <chr>        <chr>                <dbl>          <int>        <dbl>
1 City Hotel   Eastern Europe     745533.           1866         400.
2 City Hotel   Northern Europe   3290622.           8565         384.
3 City Hotel   Southern Europe  10209285.          39229         260.
4 City Hotel   Western Europe    7750375.          20929         370.
5 Resort Hotel Eastern Europe     492552.            820         601.
6 Resort Hotel Northern Europe   4945907.           9745         508.
  1. 筛选欧洲大陆的数据-by region
# European bookings summarised per region only: summed total_rates, number
# of bookings, and their ratio.
dfcregiononly <- dfc %>%
  filter(continent == "Europe") %>%
  group_by(region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n()
  ) %>%
  mutate(average_rate = total_rate / bookings_count)
# Fix: preview the table that was just built; the call previously printed
# dfchotelregions again.
head(dfcregiononly)
# A tibble: 6 × 5
# Groups:   hotel [2]
  hotel        region          total_rate bookings_count average_rate
  <chr>        <chr>                <dbl>          <int>        <dbl>
1 City Hotel   Eastern Europe     745533.           1866         400.
2 City Hotel   Northern Europe   3290622.           8565         384.
3 City Hotel   Southern Europe  10209285.          39229         260.
4 City Hotel   Western Europe    7750375.          20929         370.
5 Resort Hotel Eastern Europe     492552.            820         601.
6 Resort Hotel Northern Europe   4945907.           9745         508.
  1. 作图
# Bookings per European region
plot1b <- ggplot(
  dfcregiononly,
  aes(reorder(region, -bookings_count), bookings_count)
) +
  geom_col(fill = "steelblue") +
  labs(title = "欧洲预定数量") +
  Theme1

# Average spend per booking by European region.
# Dollar labels added so the axis matches the other average-rate charts.
plot2b <- ggplot(
  dfcregiononly,
  aes(reorder(region, -average_rate), average_rate)
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "欧洲平均花销") +
  Theme1

# Bookings per European region, split by hotel type.
# Fix: the y axis shows booking counts, so it must not carry a "$" prefix.
# The old call passed a misspelled `perfix = "$"` that was never applied;
# the stray argument is removed and the plain number format is kept.
plot3b <- ggplot(
  dfchotelregions,
  aes(reorder(region, -bookings_count),
      bookings_count,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("City Hotel" = "steelblue3",
               "Resort Hotel" = "steelblue4")
  ) +
  scale_y_continuous(labels = unit_format(suffix = "")) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Average spend per booking by European region, split by hotel type
plot4b <- ggplot(
  dfchotelregions,
  aes(reorder(region, -average_rate),
      average_rate,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("City Hotel" = "steelblue3",
               "Resort Hotel" = "steelblue4")
  ) +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# 2 x 2 layout: region totals on top, hotel split below
(plot1b | plot2b) / (plot3b | plot4b)

7.3.3 美洲

# Bookings from the Americas summarised per hotel type and region: summed
# total_rates, number of bookings, and their ratio.
ame_hotel_regions <- dfc %>%
  filter(continent == "Americas") %>%
  group_by(hotel, region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n(),
    average_rate = total_rate / bookings_count
  )

# The same figures per region only
ame_region <- dfc %>%
  filter(continent == "Americas") %>%
  group_by(region) %>%
  summarise(
    total_rate = sum(total_rates),
    bookings_count = n(),
    average_rate = total_rate / bookings_count
  )
# Bookings per region of the Americas
plot1c <- ggplot(
  ame_region,
  aes(reorder(region, -bookings_count), bookings_count)
) +
  geom_col(fill = "steelblue") +
  labs(title = "美洲预定数量") +
  Theme1

# Average spend per booking by region of the Americas
plot2c <- ggplot(
  ame_region,
  aes(reorder(region, -average_rate), average_rate)
) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "美洲平均花销") +
  Theme1

# Bookings per region, split by hotel type
plot3c <- ggplot(
  ame_hotel_regions,
  aes(reorder(region, -bookings_count),
      bookings_count,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("City Hotel" = "steelblue3",
               "Resort Hotel" = "steelblue4")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# Average spend per booking by region, split by hotel type.
# Fix: the title read "独家酒店", a typo for "度假酒店" (resort hotel), which
# is the wording used by the matching charts.
plot4c <- ggplot(
  ame_hotel_regions,
  aes(reorder(region, -average_rate),
      average_rate,
      fill = hotel)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = c("City Hotel" = "steelblue3",
               "Resort Hotel" = "steelblue4")
  ) +
  scale_y_continuous(
    labels = unit_format(prefix = "$", suffix = "")
  ) +
  labs(title = "按照城市和度假酒店细分") +
  Theme1

# 2 x 2 layout: region totals on top, hotel split below
(plot1c | plot2c) / (plot3c | plot4c)

7.3.4 亚洲

7.3.5 大洋洲

7.3.6 非洲

7.4 不同国家预订数量分布

# 1. Per-country summary for city hotels.
#    bookings_count is the natural log of the number of bookings, rounded to
#    one decimal.
#    Fix: average_rate is now computed inside summarize(), where n() is the
#    number of bookings in the country group. In the former mutate() step
#    n() was the number of rows of the summary table (i.e. countries), so
#    the "average" was total_rate divided by the wrong count.
dfcCity <- dfc %>%
  filter(hotel == "City Hotel") %>%
  group_by(country) %>%
  summarize(
    total_rate = sum(total_rates),
    bookings_count = round(log(n()), 1),
    average_rate = total_rate / n()
  )

# 2. Per-country summary for resort hotels (same columns and same fix)
dfcResort <- dfc %>%
  filter(hotel == "Resort Hotel") %>%
  group_by(country) %>%
  summarize(
    total_rate = sum(total_rates),
    bookings_count = round(log(n()), 1),
    average_rate = total_rate / n()
  )

# 3. Choropleth maps of log booking volume, joined to the world map on the
#    ISO3 country code.
# NOTE(review): hc_legend(align = "top") is kept as in the original; "top"
# does not look like a horizontal alignment value - confirm against the
# Highcharts legend options.
MapCity <- highchart() %>%
  hc_add_series_map(worldgeojson, dfcCity,
                    value = "bookings_count",
                    joinBy = c("iso3", "country")) %>%
  hc_colorAxis(minColor = "#fde725",
               maxColor = "#0d0887") %>%
  hc_legend(align = "top", verticalAlign = "top",
            layout = "horizontal", x = -8, y = 41) %>%
  hc_title(text = "City Hotel Bookings Volume by Country",
           align = "left") %>%
  hc_subtitle(text = "Scale in Natural Log", align = "left")

# The resort map now uses the same colour axis as the city map so that the
# two maps shown side by side share one palette.
MapResort <- highchart() %>%
  hc_add_series_map(worldgeojson, dfcResort,
                    value = "bookings_count",
                    joinBy = c("iso3", "country")) %>%
  hc_colorAxis(minColor = "#fde725",
               maxColor = "#0d0887") %>%
  hc_legend(align = "top", verticalAlign = "top",
            layout = "horizontal", x = -8, y = 41) %>%
  hc_title(text = "Resort Hotel Bookings Volume by Country",
           align = "left") %>%
  hc_subtitle(text = "Scale in Natural Log", align = "left")

# 4. Arranging and aligning the two maps in one grid
hw_grid(MapCity, MapResort)