1 Neighbourhood boundaries

1.1 BCC suburbs

# orig data missing???
SUB <- read_csv("data-raw/geo/suburb-and-adjoining-suburb-november-2019.zip") %>% 
  clean_names() %>%
  remove_empty(c("rows", "cols")) %>% 
  select(suburb_name) %>% 
  rename(SSC_NAME16 = suburb_name) %>%  
  distinct() %>% 
  mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>% 
  arrange(SSC_NAME16)

write_rds(SUB, "data/geo/clean/SUB.Rds")

This might very well include areas with no pops (and therefore no SEIFA), for instance:

# A tibble: 2 × 1
  SSC_NAME16      
  <chr>           
1 Brisbane Airport
2 Port Of Brisbane

1.2 ABS

Names from Brisbane containing (Brisbane - Qld), names from Qld containing (Qld) have to be cleaned to match BCC data.

Mcdowall is called McDowall and Mount Coot-tha is Mount Coot-Tha - these have been unified as well.

unzip("data-raw/geo/1270055003_ssc_2016_aust_shape.zip", 
      exdir = "data-raw/geo")

SSC <- st_read("data-raw/geo/1270055003_ssc_2016_aust_shape/SSC_2016_AUST.shp", 
               stringsAsFactors = FALSE) %>% 
  mutate(SSC_CODE16 = as.integer(SSC_CODE16)) %>% 
  select(-STE_NAME16, -STE_CODE16, -AREASQKM16) %>%     
  st_transform(3112) %>%    
  filter(!st_is_empty(geometry)) %>% 
  mutate(SSC_NAME16_orig = SSC_NAME16) %>%  
  mutate(SSC_NAME16 = str_remove(SSC_NAME16, 
                                 fixed(" (Brisbane - Qld)"))) %>% 
  mutate(SSC_NAME16 = str_remove(SSC_NAME16, 
                                 fixed(" (Qld)"))) %>% 
  mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "McDowall",
                             "Mcdowall", SSC_NAME16)) %>% 
  mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Mount Coot-tha",
                             "Mount Coot-Tha", SSC_NAME16))

# SSC <- rmapshaper::ms_simplify(SSC, keep = 0.05, weighting = 0.7) # default settings

write_rds(SSC, "data/geo/SSC_2016_AUST.Rds")

unlink("data-raw/geo/1270055003_ssc_2016_aust_shape", recursive = TRUE)

Stones Corner doesn’t exist in ABS but it does in BCC. It seems it’s part of Greenslopes.

# A tibble: 1 × 3
  SSC_NAME16    SSC_CODE16 SSC_NAME16_orig
  <chr>              <int> <chr>          
1 Stones Corner         NA <NA>

After clean the datasets were merged.

Full map

2 SEIFA

SEIFA <- read_xls("data-raw/SEIFA/2033055001 - ssc indexes.xls", 
                  sheet = "Table 1", skip = 5, n_max = 13719, na = "-") %>% 
  clean_names() %>%
  remove_empty(c("rows", "cols")) %>% 
  dplyr::rename(SSC_CODE16 = x1,
                SSC_NAME16 = x2,
                IRSD = score_3,
                IRSD_d = decile_4,
                IRSAD = score_5,
                IRSAD_d = decile_6,
                IER = score_7,
                IER_d = decile_8,
                IEO = score_9,
                IEO_d = decile_10,
                URP = x11,
                caution = x12) %>% 
  # deciles kept as integers here
  mutate(SSC_CODE16 = as.integer(SSC_CODE16),
         IRSD = as.integer(IRSD),
         IRSAD = as.integer(IRSAD),
         IER = as.integer(IER),
         IEO = as.integer(IEO),
         IRSD_d = as.integer(IRSD_d),
         IRSAD_d = as.integer(IRSAD_d),
         IER_d = as.integer(IER_d),
         IEO_d = as.integer(IEO_d),
         URP = as.integer(URP)
  ) %>% 
  mutate(caution = as.logical(ifelse(is.na(caution), "False", "True")))

2.1 Coverage

Comparing Australia & Brisbane, example of IRSD.

2.1.1 Frequencies table

2.1.1.1 Australia

x <integer> 
# total N=13713 valid N=13691 mean=5.50 sd=2.87

Value |    N | Raw % | Valid % | Cum. %
---------------------------------------
    1 | 1369 |  9.98 |   10.00 |  10.00
    2 | 1369 |  9.98 |   10.00 |  20.00
    3 | 1370 |  9.99 |   10.01 |  30.01
    4 | 1370 |  9.99 |   10.01 |  40.01
    5 | 1367 |  9.97 |    9.98 |  50.00
    6 | 1370 |  9.99 |   10.01 |  60.00
    7 | 1369 |  9.98 |   10.00 |  70.00
    8 | 1369 |  9.98 |   10.00 |  80.00
    9 | 1371 | 10.00 |   10.01 |  90.02
   10 | 1367 |  9.97 |    9.98 | 100.00
 <NA> |   22 |  0.16 |    <NA> |   <NA>

2.1.1.2 Brisbane

x <integer> 
# total N=193 valid N=184 mean=7.79 sd=2.37

Value |  N | Raw % | Valid % | Cum. %
-------------------------------------
    1 |  3 |  1.55 |    1.63 |   1.63
    2 |  6 |  3.11 |    3.26 |   4.89
    3 |  5 |  2.59 |    2.72 |   7.61
    4 |  5 |  2.59 |    2.72 |  10.33
    5 | 13 |  6.74 |    7.07 |  17.39
    6 | 12 |  6.22 |    6.52 |  23.91
    7 | 28 | 14.51 |   15.22 |  39.13
    8 | 18 |  9.33 |    9.78 |  48.91
    9 | 33 | 17.10 |   17.93 |  66.85
   10 | 61 | 31.61 |   33.15 | 100.00
 <NA> |  9 |  4.66 |    <NA> |   <NA>

2.1.2 Frequencies plot

2.1.2.1 Australia

2.1.2.2 Brisbane

2.1.3 Range per decile

2.1.3.1 Australia

2.1.3.2 Brisbane

2.2 Missing

Few areas with missing SEIFA

          SSC_NAME16 IRSD_d IRSAD_d IER_d IEO_d URP
1        Banks Creek     NA      NA    NA    NA  NA
2   Brisbane Airport     NA      NA    NA    NA  NA
3         Eagle Farm     NA      NA    NA    NA  NA
4 Enoggera Reservoir     NA      NA    NA    10  25
5          Karawatha     NA      NA    NA    NA  NA
6          Larapinta     NA      NA    NA    NA  NA
7             Lytton     NA      NA    NA    NA  NA
8     Mount Coot-Tha     NA      NA    NA    NA  NA
9   Port Of Brisbane     NA      NA    NA    NA  NA

These were excluded.

2.3 Caution

Few cases with ABS flag caution.

     SSC_NAME16 IRSD IRSD_d IRSAD IRSAD_d  IER IER_d  IEO IEO_d URP
1        Bulwer 1014      6   996       6 1012     5 1024     7  49
2   Cowan Cowan 1014      6   996       6 1012     5 1024     7  28
3 England Creek 1004      5   978       5 1063     8  949     3  33
4     Kooringal 1014      6   996       6 1012     5 1024     7  45

Usually with very small pop numbers.

These remain included.

2.4 ‘Local’ deciles

Original values of indices were used to calculate ‘local deciles’ using SSCs for Brisbane only

SSC %<>% 
  mutate(IRSD_d_orig = IRSD_d,
         IRSD_d = ntile(IRSD, 10),
         IRSAD_d_orig = IRSAD_d,
         IRSAD_d = ntile(IRSAD, 10),
         IER_d_orig = IER_d,
         IER_d = ntile(IER, 10),
         IEO_d_orig = IEO_d,
         IEO_d = ntile(IEO, 10)) %>% 
  mutate(across(ends_with("_d"), factor)) %>% 
  mutate(across(ends_with("_d_orig"), factor)) %>% 
  relocate(IRSD_d_orig, .after = IRSD_d) %>% 
  relocate(IRSAD_d_orig, .after = IRSAD_d) %>% 
  relocate(IER_d_orig, .after = IER_d) %>% 
  relocate(IEO_d_orig, .after = IEO_d)

Note: Deciles are represented as factor variable. Certain application could benefit from changing it to ordered?

Agreement across specific indices :

2.4.1 `IRSD_d`

IRSD_d_orig == IRSD_d <lgl> 
# total N=184 valid N=184 mean=0.11 sd=0.32

Value |   N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 163 | 88.59 |   88.59 |  88.59
TRUE  |  21 | 11.41 |   11.41 | 100.00
<NA>  |   0 |  0.00 |    <NA> |   <NA>

2.4.2 `IRSAD_d`

IRSAD_d_orig == IRSAD_d <lgl> 
# total N=184 valid N=184 mean=0.11 sd=0.32

Value |   N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 163 | 88.59 |   88.59 |  88.59
TRUE  |  21 | 11.41 |   11.41 | 100.00
<NA>  |   0 |  0.00 |    <NA> |   <NA>

2.4.3 `IER_d`

IER_d_orig == IER_d <lgl> 
# total N=184 valid N=184 mean=0.71 sd=0.45

Value |   N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE |  53 | 28.80 |   28.80 |  28.80
TRUE  | 131 | 71.20 |   71.20 | 100.00
<NA>  |   0 |  0.00 |    <NA> |   <NA>

2.4.4 `IEO_d`

IEO_d_orig == IEO_d <lgl> 
# total N=184 valid N=184 mean=0.12 sd=0.33

Value |   N | Raw % | Valid % | Cum. %
--------------------------------------
FALSE | 162 | 88.04 |   88.04 |  88.04
TRUE  |  22 | 11.96 |   11.96 | 100.00
<NA>  |   0 |  0.00 |    <NA> |   <NA>

3 Dog cost

dog_cost <- read_xlsx("data-raw/costs/dog_expensive.xlsx") %>% 
  clean_names() %>%
  remove_empty(c("rows", "cols")) %>% 
  select(-web_source) %>% 
  rename(dog_breed = breed) %>%  
  select(dog_breed) %>% 
  distinct() %>% 
  # correcting names for better matching - these one used in BCE
  mutate(
    dog_breed = case_when(
      dog_breed == "Hairless Chinese Crested" ~ "Chinese Crested Dog",
      dog_breed == "Saint Bernard" ~ "St Bernard",
      TRUE ~ as.character(dog_breed))
  ) %>% 
  # synonyms
  add_row(dog_breed = "Dogue de Bordeaux") %>% 
  add_row(dog_breed = "Bulldog") %>% 
  add_row(dog_breed = "British Bulldog") %>% 
  mutate(expensive = "yes") %>% 
  arrange(dog_breed)

Top 20 most expensive dogs (+3 synonyms!)

4 Dog insurance

Data scraped from https://top10petinsurance.com.au/pet-insurance-prices on the 30th March 2020

# packages needed
# install.packages("rvest")

p_load(rvest, tidyverse)

# scraping table done with this using chrome: https://www.r-bloggers.com/using-rvest-to-scrape-an-html-table/
url <- 'https://top10petinsurance.com.au/pet-insurance-prices/'   

dog_insurance <- url %>%
  xml2::read_html() %>%
  html_nodes(xpath='//*[@id="post-1016"]/div/table') %>%
  html_table()

dog_insurance <- dog_insurance[[1]]

head(dog_insurance)

write_rds(dog_insurance, "data-raw/costs/dog_insurance.Rds") # extracted on the 30 March 2020

dog_insurance <- read_rds("data-raw/costs/dog_insurance.Rds") %>% 
  as_tibble() %>% 
  clean_names() %>%
  remove_empty(c("rows", "cols")) %>% 
  # select(-) %>% 
  mutate(average_accident_policy_cost_annual = 
           gsub(",", "",
                average_accident_policy_cost_annual, 
                fixed = TRUE),
         average_illness_policy_cost_annual = 
           gsub(",", "",
                average_illness_policy_cost_annual, 
                fixed = TRUE),
         average_comprehensive_policy_cost_annual = 
           gsub(",", "",
                average_comprehensive_policy_cost_annual, 
                fixed = TRUE)
  ) %>% 
  mutate(average_accident_policy_cost_annual = 
           as.numeric(gsub("$", "",
                           average_accident_policy_cost_annual, 
                           fixed = TRUE)),
         average_illness_policy_cost_annual = 
           as.numeric(gsub("$", "",
                           average_illness_policy_cost_annual, 
                           fixed = TRUE)),
         average_comprehensive_policy_cost_annual = 
           as.numeric(gsub("$", "",
                           average_comprehensive_policy_cost_annual, 
                           fixed = TRUE))
  ) %>% 
  filter(dog_breed != "Unknown Dog Breed")

4.1 Three major categories

cost_compared_to_other_breeds <character> 
# total N=557 valid N=557 mean=1.97 sd=0.42

Value                       |   N | Raw % | Valid % | Cum. %
------------------------------------------------------------
Above average               |  58 | 10.41 |   10.41 |  10.41
Below average               | 457 | 82.05 |   82.05 |  92.46
Significantly above average |  42 |  7.54 |    7.54 | 100.00
<NA>                        |   0 |  0.00 |    <NA> |   <NA>

4.2 Individual breeds

Breeds in Significantly above average category:

Breeds in Above average category:

5 Dog ownership in Brisbane

Raw dataset named cars-bis-open-data-animal-permits-3-jan-2019 consists of 107,405 records.

5.1 Data selection

Excluding records with:

permit_name: Breeders Permit, Cattery Permit, Racehorses Permit, Pet Shop Permit, Domestic Dog Permit & Guard Dog Permit
without neighbourhood
without dog_breed values
with dog_breed listed as Unknown or Cross

Stones Corner values were assigned to Greenslopes SSC (see information in section above).

dog_ownership %<>% 
  clean_names() %>%
  remove_empty(c("rows", "cols")) %>% 
  # all the same here
  select(-permit_group, -permit_status) %>% 
  # special permits?
  filter(!permit_name %in% c("Breeders Permit", "Cattery Permit", "Racehorses Permit", "Pet Shop Permit")) %>% 
  filter(!permit_name %in% c("Domestic Dog Permit", "Guard Dog Permit")) %>%
  rename(dog_breed = animal_breed,
         SSC_NAME16 = application_location_suburb) %>% 
  mutate(SSC_NAME16 = str_to_title(SSC_NAME16)) %>% 
  # correct suburb
  mutate(SSC_NAME16 = ifelse(SSC_NAME16 == "Stones Corner", "Greenslopes", SSC_NAME16)) %>% 
  # missing geo
  filter(!is.na(SSC_NAME16)) %>% 
  # missing breed
  filter(!is.na(dog_breed)) %>% 
  filter(!dog_breed %in% c("Unknown", 
                           "Medium Cross Breed", "Small Cross Breed", "Large Cross Breed")) %>% 
  # few cleans for better matches
  mutate(
    dog_breed = case_when(
      dog_breed == "German Shepherd Dog (Long Stock Coat)" ~ "German Shepherd",
      dog_breed == "German Shepherd Dog" ~ "German Shepherd",
      dog_breed == "Central Asian Shepherd Dog " ~ "Central Asian Shepherd",
      dog_breed == "Kangal Dog" ~ "Kangal",
      dog_breed == "Bulldog" ~ "British Bulldog",
      dog_breed == "Collie (Rough)" ~ "Rough Collie", 
      dog_breed == "Collie (Smooth)" ~ "Smooth Collie",      
      
      TRUE ~ as.character(dog_breed))
  )

Prepared dataset consists of 106,018 records.

5.2 Data merging - `dog_cost`

dog_ownership_cost <- left_join(dog_ownership, dog_cost, by = "dog_breed") %>% 
  # binary indicator "expensive" and "non-expensive" dog breeds according to dog_cost
  mutate(
    expensive = case_when(
      is.na(expensive) ~ 0,
      expensive == "yes" ~ 1
    ),
    expensive = as.integer(expensive)
  )

Expensive dogs in Brisbane:

expensive <integer> 
# total N=106018 valid N=106018 mean=0.14 sd=0.35

Value |     N | Raw % | Valid % | Cum. %
----------------------------------------
    0 | 90940 | 85.78 |   85.78 |  85.78
    1 | 15078 | 14.22 |   14.22 | 100.00
 <NA> |     0 |  0.00 |    <NA> |   <NA>

5.3 Data merging - `dog_insurance`

Correcting names for better match

dog_insurance %<>%
  mutate(
    dog_breed = case_when(
      # this is a bit tricky! might need sensitivity?
      dog_breed == "French Poodle" ~ "Poodle",  
      
      # just naming issues
      dog_breed == "Poodle – Standard" ~ "Poodle (Standard)",
      dog_breed == "Miniature Poodle" ~ "Poodle (Miniature)",
      dog_breed == "Poodle – Toy" ~ "Poodle (Toy)",
      dog_breed == "Shar-Pei" ~ "Shar Pei", 
      dog_breed == "German Short Haired Pointer" ~ "German Shorthaired Pointer", 
      dog_breed == "German Wire Haired Pointer" ~ "German Wirehaired Pointer", 
      dog_breed == "Collie – Rough" ~ "Rough Collie", 
      dog_breed == "Collie – Smooth" ~ "Smooth Collie", 
      dog_breed == "Miniature Schnauzer" ~ "Schnauzer (Miniature)", 
      dog_breed == "Schnauzer Giant" ~ "Schnauzer (Giant)", 
      dog_breed == "Lagotto Rom" ~ "Lagotto Romagnolo", 
      dog_breed == "Brittany Spaniel" ~ "Brittany", 
      dog_breed == "Staghound" ~ "Stag Hound", 
      dog_breed == "Kerry Blue" ~ "Kerry Blue Terrier", 
      dog_breed == "English Toy terrier" ~ "English Toy Terrier",
      dog_breed == "Parson Jack Russell Terrier" ~ "Parson Russell Terrier", 
      dog_breed == "Welsh Corgi – Pembroke" ~ "Welsh Corgi (Pembroke)", 
      dog_breed == "American Cocker Spaniel" ~ "Cocker Spaniel (American)",
      dog_breed == "Basset Fauve De Bretagne" ~ "Basset Fauve de Bretagne", 
      dog_breed == "Norwegian Elk Hound" ~ "Norwegian Elkhound", 
      dog_breed == "Cheasapeake Bay Retriever" ~ "Chesapeake Bay Retriever", 
      dog_breed == "Bouvier Des Flandres" ~ "Bouvier des Flandres", 
      dog_breed == "Miniature Bull Terrier" ~ "Bull Terrier (Miniature)", 
      dog_breed == "Munsterlander – Large" ~ "Large Munsterlander", 
      dog_breed == "Welsh Corgi – Cardigan" ~ "Welsh Corgi (Cardigan)", 
      dog_breed == "HamiltonStovare" ~ "Hamiltonstovare",
      dog_breed == "Blue tick Coonhound" ~ "Bluetick Coonhound",
      dog_breed == "Japanese Akita" ~ "Akita (Japanese)",
      dog_breed == "Dogue De Bordeaux" ~ "Dogue de Bordeaux",
      dog_breed == "Italian Cane Corso" ~ "Italian Corso Dog",
      
      # different kelpies but same category anyway
      dog_breed == "Australian Kelpie Sheepdog" ~ "Australian Kelpie", 
      
      # typo
      dog_breed == "Neopolitan Mastiff" ~ "Neapolitan Mastiff", 
      TRUE ~ as.character(dog_breed)))

Dog breeds without match

Some further corrections still possible here:

dog_ownership_cost <-  left_join(dog_ownership_cost, 
                                 dog_insurance %>% 
                                   select(dog_breed, cost_compared_to_other_breeds)
) %>% 
  mutate(
    cost_compared_to_other_breeds = case_when(
      # all the same
      dog_breed == "Fox Terrier" ~ "Below average",
      dog_breed == "Fox Terrier (Smooth)" ~ "Below average",
      dog_breed == "Fox Terrier (Wire)" ~ "Below average",
      dog_breed == "Schnauzer" ~ "Below average",
      dog_breed == "Australian Stumpy Tail Cattle Dog" ~ "Below average",
      dog_breed == "Foxhound" ~ "Below average",
      dog_breed == "White Swiss Shepherd Dog" ~ "Below average",
      
      # multiple options here, but all above going for conservative
      dog_breed == "Welsh Corgi" ~ "Above average",
      
      # taking values from Chihuahua
      dog_breed == "Chihuahua (Smooth Coat)" ~ "Below average",
      dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
      dog_breed == "Chihuahua (Long Coat)" ~ "Below average",
      
      # naming - Dogue De Bordeaux
      dog_breed == "French Mastiff" ~ "Significantly above average",
      TRUE ~ as.character(cost_compared_to_other_breeds))) %>% 
  select(-permit_name)

Few things left:

Two largest groups:

Pointer - insufficient info!
Akita - insufficient info! Could be Inu, could be Japanese

These observations will remain as NAs.

6 Dogs per capita

Summarizing all dogs, and expensive only.

dog_agg <- dog_ownership_cost %>% 
  group_by(SSC_NAME16) %>% 
  summarise(dogs_total = n())

SSC %<>% 
  left_join(dog_agg)

6.1 Missing SEIFA

Areas with no URP/SEIFA but having (small amount of) dogs:

          SSC_NAME16 dogs_total
1         Eagle Farm          1
2 Enoggera Reservoir          3
3          Karawatha          6
4             Lytton          1

These areas were excluded from the dog_ownership_cost dataset.

6.2 Missing dogs

Area with low URP (also marked as caution == TRUE in SEIFA) and no dogs at all:

     SSC_NAME16 dogs_total URP
1 England Creek         NA  33

SSC %<>% 
  filter(dogs_total > 0) %>% 
  mutate(dogs_percap = (dogs_total / URP)*100) %>% 
  relocate(geometry, .after = last_col())

This area was excluded.

6.3 Ranking

6.4 Map

7 Wide format datasets

7.1 All breeds

wide_all_n <- dog_ownership_cost %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = n) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_all_n <- right_join(SSC, wide_all_n)

wide_all_p <- dog_ownership_cost %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  group_by(SSC_NAME16) %>% 
  mutate(N = sum(n)) %>% 
  ungroup() %>% 
  mutate(p = n/N) %>% 
  select(-n, -N) %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = p) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_all_p <- right_join(SSC, wide_all_p)

7.2 Only expensive breeds (cost)

wide_cost_n <- dog_ownership_cost %>% 
  filter(expensive == 1) %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = n) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_cost_n <- right_join(SSC, wide_cost_n)

wide_cost_p <- dog_ownership_cost %>% 
  filter(expensive == 1) %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  group_by(SSC_NAME16) %>% 
  mutate(N = sum(n)) %>% 
  ungroup() %>% 
  mutate(p = n/N) %>% 
  select(-n, -N) %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = p) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_cost_p <- right_join(SSC, wide_cost_p)

7.3 Only expensive breeds (insurance)

wide_insurance_n <- dog_ownership_cost %>% 
  # filter(cost_compared_to_other_breeds != "Below average") %>% 
  filter(cost_compared_to_other_breeds == "Significantly above average") %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = n) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_insurance_n <- right_join(SSC, wide_insurance_n)

wide_insurance_p <- dog_ownership_cost %>% 
  # filter(cost_compared_to_other_breeds != "Below average") %>% 
  filter(cost_compared_to_other_breeds == "Significantly above average") %>% 
  group_by(SSC_NAME16, dog_breed) %>% 
  summarise(n = n()) %>% 
  ungroup() %>% 
  group_by(SSC_NAME16) %>% 
  mutate(N = sum(n)) %>% 
  ungroup() %>% 
  mutate(p = n/N) %>% 
  select(-n, -N) %>% 
  pivot_wider(names_from = dog_breed, 
              values_from = p) %>% 
  mutate_all(~replace(., is.na(.), 0)) %>% 
  clean_names() %>% 
  rename(SSC_NAME16 = ssc_name16)

wide_insurance_p <- right_join(SSC, wide_insurance_p)

8 ML data

8.1 Saves

Using the Index of Education and Occupation (IEO) only as outcome. More info about the index from ABS.

Keeping all breeds and wide formats as described above; droppping spatial info.

wide_all_n %<>% 
  sf::st_drop_geometry() %>% 
  # as_tibble() %>% 
  select(IEO, IEO_d, 
         akita:last_col())

wide_all_p %<>%  
  sf::st_drop_geometry() %>% 
  # as_tibble() %>% 
  select(IEO, IEO_d, 
         akita:last_col()) %>% 
  rename(outcome = IEO,
         outcome_d = IEO_d)

8.2 Near zero-variance

nzv <- nearZeroVar(wide_all_p, saveMetrics = TRUE)

nzv <- nearZeroVar(wide_all_p)
wide_all_p <- wide_all_p[, -nzv]
wide_all_n <- wide_all_n[, -nzv]
rm(nzv)

wide_all_n %<>% 
  mutate(weight = rowSums(across(where(is.numeric)))) %>% 
  select(weight)

8.3 Correlated breeds

descr_cor <- cor(wide_all_p %>% select(-outcome, -outcome_d)) %>% 
  as_tibble(rownames = "rowname") %>% 
  pivot_longer(-rowname) %>% 
  filter(rowname != name) %>% 
  arrange(desc(value)) %>% 
  filter(row_number() %% 2 == 1) %>% 
  mutate(value = scales::number(value, accuracy = 0.01)) %>% 
  rename(correlation = value)

8.3.1 Linear combinations

combo_info <- findLinearCombos(wide_all_p %>% select(-outcome, -outcome_d))
combo_info

$linearCombos
list()

$remove
NULL

8.4 Train split

75/25 split, taking into account distribution of deciles:

inTrain <- createDataPartition(
  y = wide_all_p$outcome_d,
  p = .75,
  list = FALSE
)

training <- wide_all_p[ inTrain, ]
testing  <- wide_all_p[-inTrain, ]

training_weights <- wide_all_n[ inTrain,]

Training data:

outcome_d <categorical> 
# total N=143 valid N=143 mean=5.45 sd=2.88

Value |  N | Raw % | Valid % | Cum. %
-------------------------------------
    1 | 14 |  9.79 |    9.79 |   9.79
    2 | 15 | 10.49 |   10.49 |  20.28
    3 | 15 | 10.49 |   10.49 |  30.77
    4 | 15 | 10.49 |   10.49 |  41.26
    5 | 14 |  9.79 |    9.79 |  51.05
    6 | 14 |  9.79 |    9.79 |  60.84
    7 | 14 |  9.79 |    9.79 |  70.63
    8 | 14 |  9.79 |    9.79 |  80.42
    9 | 14 |  9.79 |    9.79 |  90.21
   10 | 14 |  9.79 |    9.79 | 100.00
 <NA> |  0 |  0.00 |    <NA> |   <NA>

Testing data:

outcome_d <categorical> 
# total N=40 valid N=40 mean=5.50 sd=2.91

Value | N | Raw % | Valid % | Cum. %
------------------------------------
    1 | 4 |    10 |      10 |     10
    2 | 4 |    10 |      10 |     20
    3 | 4 |    10 |      10 |     30
    4 | 4 |    10 |      10 |     40
    5 | 4 |    10 |      10 |     50
    6 | 4 |    10 |      10 |     60
    7 | 4 |    10 |      10 |     70
    8 | 4 |    10 |      10 |     80
    9 | 4 |    10 |      10 |     90
   10 | 4 |    10 |      10 |    100
 <NA> | 0 |     0 |    <NA> |   <NA>

Outcome distro checks

9 Computing Environment

 R version 4.2.0 (2022-04-22 ucrt)
 Platform: x86_64-w64-mingw32/x64 (64-bit)
 Running under: Windows 10 x64 (build 18363)
 
 Matrix products: default
 
 attached base packages:
 [1] stats     graphics  grDevices utils     datasets  methods   base     
 
 other attached packages:
  [1] caret_6.0-92     lattice_0.20-45  tmap_3.3-3       sf_1.0-8        
  [5] gmodels_2.18.1.1 DT_0.23          sjPlot_2.8.10    sjmisc_2.8.9    
  [9] janitor_2.1.0    scales_1.2.0     magrittr_2.0.3   readxl_1.4.0    
 [13] forcats_0.5.1    stringr_1.4.0    dplyr_1.0.9      purrr_0.3.4     
 [17] readr_2.1.2      tidyr_1.2.0      tibble_3.1.7     ggplot2_3.3.6   
 [21] tidyverse_1.3.2  pacman_0.5.1

To cite R in publications use:

R Core Team (2022). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria. https://www.R-project.org/.

To cite the ggplot2 package in publications use:

Wickham H (2016). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. ISBN 978-3-319-24277-4, https://ggplot2.tidyverse.org.

`{=html}

Dog-SEP

Data preps

Radoslaw Panczak

20 July, 2022