5: Joining data

Code for Quiz 6, more dplyr and our first interactive chart using echarts4r.

Steps 1-6

  1. Load the R packages we will use.
library(tidyverse)
library(echarts4r)  #install this package before using
library(hrbrthemes) #install this package before using
  1. Read the data in the files drug_cos.csv and health_cos.csv into R and assign them to the variables drug_cos and health_cos, respectively
# Download each CSV straight from its URL and parse it into a tibble.
drug_cos <- "https://estanny.com/static/week6/drug_cos.csv" %>%
  read_csv()
health_cos <- "https://estanny.com/static/week6/health_cos.csv" %>%
  read_csv()
  1. Use glimpse to get a glimpse of the data
# Compact overview of drug_cos: one line per column with its type.
glimpse(drug_cos)
Rows: 104
Columns: 9
$ ticker       <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS"…
$ name         <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoet…
$ location     <chr> "New Jersey; U.S.A", "New Jersey; U.S.A", "New …
$ ebitdamargin <dbl> 0.149, 0.217, 0.222, 0.238, 0.182, 0.335, 0.366…
$ grossmargin  <dbl> 0.610, 0.640, 0.634, 0.641, 0.635, 0.659, 0.666…
$ netmargin    <dbl> 0.058, 0.101, 0.111, 0.122, 0.071, 0.168, 0.163…
$ ros          <dbl> 0.101, 0.171, 0.176, 0.195, 0.140, 0.286, 0.321…
$ roe          <dbl> 0.069, 0.113, 0.612, 0.465, 0.285, 0.587, 0.488…
$ year         <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,…
# Compact overview of health_cos: one line per column with its type.
glimpse(health_cos)
Rows: 464
Columns: 11
$ ticker      <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS",…
$ name        <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoeti…
$ revenue     <dbl> 4233000000, 4336000000, 4561000000, 4785000000, …
$ gp          <dbl> 2581000000, 2773000000, 2892000000, 3068000000, …
$ rnd         <dbl> 427000000, 409000000, 399000000, 396000000, 3640…
$ netincome   <dbl> 245000000, 436000000, 504000000, 583000000, 3390…
$ assets      <dbl> 5711000000, 6262000000, 6558000000, 6588000000, …
$ liabilities <dbl> 1975000000, 2221000000, 5596000000, 5251000000, …
$ marketcap   <dbl> NA, NA, 16345223371, 21572007994, 23860348635, 2…
$ year        <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, …
$ industry    <chr> "Drug Manufacturers - Specialty & Generic", "Dru…
  1. Which variables are the same in both data sets?
# Collect the column names of each table, then keep the ones present in both.
names_drug <- names(drug_cos)
names_health <- names(health_cos)
intersect(names_drug, names_health)
[1] "ticker" "name"   "year"  
  1. Select subset of variables to work with
# Restrict both tables to 2018 first, then keep only the columns of interest.
drug_subset <- drug_cos %>%
  filter(year == 2018) %>%
  select(ticker, year, grossmargin)

health_subset <- health_cos %>%
  filter(year == 2018) %>%
  select(ticker, year, revenue, gp, industry)
  1. Keep all the rows and columns of drug_subset and join them with the columns in health_subset
# Left join: every row of drug_subset is kept and the matching health_subset
# columns are appended. The keys are spelled out (ticker and year are the only
# columns the two subsets share) so the match cannot silently change if either
# table later gains another common column, and no "Joining, by = ..." message
# is emitted.
drug_subset %>%
  left_join(health_subset, by = c("ticker", "year"))
# A tibble: 13 x 6
   ticker  year grossmargin   revenue        gp industry              
   <chr>  <dbl>       <dbl>     <dbl>     <dbl> <chr>                 
 1 ZTS     2018       0.672   5.82e 9   3.91e 9 Drug Manufacturers - …
 2 PRGO    2018       0.387   4.73e 9   1.83e 9 Drug Manufacturers - …
 3 PFE     2018       0.79    5.36e10   4.24e10 Drug Manufacturers - …
 4 MYL     2018       0.35    1.14e10   4.00e 9 Drug Manufacturers - …
 5 MRK     2018       0.681   4.23e10   2.88e10 Drug Manufacturers - …
 6 LLY     2018       0.738   2.46e10   1.81e10 Drug Manufacturers - …
 7 JNJ     2018       0.668   8.16e10   5.45e10 Drug Manufacturers - …
 8 GILD    2018       0.781   2.21e10   1.73e10 Drug Manufacturers - …
 9 BMY     2018       0.71    2.26e10   1.60e10 Drug Manufacturers - …
10 BIIB    2018       0.865   1.35e10   1.16e10 Drug Manufacturers - …
11 AMGN    2018       0.827   2.37e10   1.96e10 Drug Manufacturers - …
12 AGN     2018       0.861   1.58e10   1.36e10 Drug Manufacturers - …
13 ABBV    2018       0.764   3.28e10   2.50e10 Drug Manufacturers - …

Question: join_ticker

drug_cos_subset  <- drug_cos  %>% 
  ???(??? == "???")

drug_cos_subset
???  <- drug_cos_subset  %>% 
  left_join(health_cos)

combo_df

???  <- combo_df  %>% 
  distinct(name) %>% 
  pull()

co_location  <- ???  %>% 
  ???(???)  %>% 
  pull() 

co_industry  <- ???  %>% 
  ???(???)  %>% 
  ???() 

Put the r inline commands used in the blanks below. When you knit the document the results of the commands will be displayed in your text.

The company ??? is located in ??? and is a member of the ??? industry group.


combo_df_subset  <- combo_df  %>% 
  select(???, ???, ???, 
  ???, ???, ???)

???

combo_df_subset  %>% 
  mutate(grossmargin_check = ??? / ???,
  close_enough = abs(grossmargin_check - grossmargin) < 0.001)

???  %>% 
  ???(netmargin_check = ??? / ???,
  close_enough = ???(netmargin_check - netmargin) < 0.001)

Question: summarize_industry

health_cos  %>% 
  group_by(???)  %>% 
  summarize(??? = ???(??? / revenue) * 100,
            ??? = ???(??? / revenue) * 100,
            ??? = ???(??? / revenue) * 100,
            ??? = ???(??? / revenue) * 100
  ) 

Question: inline_ticker

health_cos_subset  <- health_cos  %>% 
  ???(ticker == "???")
health_cos_subset 


Run the code below

health_cos_subset  %>% 
  distinct(name) %>%  
  pull(name)
???  <- health_cos_subset  %>% 
  distinct(name) %>% 
  pull(name)

You can take output from your code and include it in your text.

In the following chunk

co_industry  <- ???  %>% 
  ???(industry) %>% 
  ???()

This is outside the R chunk. Put the r inline commands used in the blanks below. When you knit the document the results of the commands will be displayed in your text.

The company ??? is a member of the ??? group.


Steps 7-11

  1. Prepare the data for the plots
# For every row compute R&D spending relative to revenue, then take the
# median of that ratio within each industry.
df <- health_cos %>%
  mutate(rnd_rev = rnd / revenue) %>%
  group_by(industry) %>%
  summarize(med_rnd_rev = median(rnd_rev))
  1. Use glimpse to glimpse the data for the plots
# Compact overview of the plotting data: one line per column with its type.
glimpse(df)
Rows: 9
Columns: 2
$ industry    <chr> "Biotechnology", "Diagnostics & Research", "Drug…
$ med_rnd_rev <dbl> 0.48317287, 0.05620271, 0.17451442, 0.06851879, …
  1. Create a static bar chart
# Horizontal bar chart: industries are ordered by med_rnd_rev and the value
# axis is labelled as percentages.
df %>%
  ggplot(aes(x = reorder(industry, med_rnd_rev), y = med_rnd_rev)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    x = NULL,
    y = NULL,
    title = "Median R&D expenditures",
    subtitle = "by industry as a percent of revenue from 2011 to 2018"
  ) +
  theme_ipsum()

  1. Save the previous plot to preview.png and add to the yaml chunk at the top
# Write the most recently displayed plot to preview.png inside this post's
# directory (resolved relative to the project root by here::here()).
ggsave(
  "preview.png",
  plot = last_plot(),
  path = here::here("_posts", "2021-02-27-joining-data")
)
  1. Create an interactive bar chart using the package echarts4r
# Interactive version of the bar chart: rows are sorted by med_rnd_rev, the
# coordinates are flipped so bars run horizontally, the value axis is shown
# as whole-number percentages, and the category axis and legend are hidden.
df %>%
  arrange(med_rnd_rev) %>%
  e_charts(x = industry) %>%
  e_bar(serie = med_rnd_rev, name = "median") %>%
  e_flip_coords() %>%
  e_tooltip() %>%
  e_title(
    text = "Median industry R&D expenditures",
    subtext = "by industry as a percent of revenue from 2011 to 2018",
    left = "center"
  ) %>%
  e_legend(show = FALSE) %>%
  e_x_axis(formatter = e_axis_formatter(style = "percent", digits = 0)) %>%
  e_y_axis(show = FALSE) %>%
  e_theme("infographic")