Align sumstats with RICOPILI pipeline

Sumstats from Generation Scotland need to be aligned with the PGC imputation panel (1KG).

library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Read the tab-separated summary statistics into a tibble.
daner <- read_tsv(file = 'daner_mdd_genscot_1119a_rmUKBB.gz')

## Parsed with column specification:
## cols(
##   CHR = col_integer(),
##   SNP = col_character(),
##   BP = col_integer(),
##   A1 = col_character(),
##   A2 = col_character(),
##   FRQ_A_951 = col_double(),
##   FRQ_U_6114 = col_double(),
##   INFO = col_double(),
##   OR = col_double(),
##   SE = col_double(),
##   P = col_double(),
##   ngt = col_integer()
## )

List and load the alignment files. The document suggests pulling from files with the suffix `*.EUR.frq2.gz`, but these do not exist for the circa-2012 imputation files that were used for MDD2, so instead we open the `*.eur.bfile.bim` files.

# Collect the per-chromosome *.eur.bfile.bim files of the reference panel.
eur_bim_files <- list.files(
  '/home/gwas/pgc-samples/hapmap_ref/impute2_ref/1KG_Aug12/ALL_1000G_phase1integrated_v3_impute_macGT1/',
  pattern = "my.ALL_1000G_phase1integrated_v3_aug2012_macGT1_chr.+\\.eur\\.bfile\\.bim",
  full.names = TRUE
)

# Fail early with a clear message instead of an obscure join error later on
# when the directory is missing or the pattern matches nothing.
if (length(eur_bim_files) == 0) {
  stop("No *.eur.bfile.bim reference files were found", call. = FALSE)
}

# Specify column types: i = integer, c = character, d = double.
# The third .bim column (CM) is a genetic distance and may be fractional, so it
# is read as a double; parsing it as an integer would turn such values into NA.
eur_bim <- bind_rows(lapply(
  eur_bim_files,
  read_table2,
  col_names = c('CHR', 'snpid', 'CM', 'BP', 'A1', 'A2'),
  col_types = 'icdicc'
))

Merge sumstats on CPIDs CHR_POS_A1_A2 and CHR_POS_A2_A1

# Match sumstats rows to reference rows on chromosome, position and alleles.
# The first join requires the alleles in the same order; the second matches
# the sumstats A1/A2 against the reference A2/A1. Both result sets are stacked.
daner_cpid <- bind_rows(
  inner_join(
    daner, eur_bim,
    by = c('CHR' = 'CHR', 'BP' = 'BP', 'A1' = 'A1', 'A2' = 'A2')
  ),
  inner_join(
    daner, eur_bim,
    by = c('CHR' = 'CHR', 'BP' = 'BP', 'A1' = 'A2', 'A2' = 'A1')
  )
)

Number of sites with different RSIDs

# Count matched rows whose sumstats SNP ID differs from the reference snpid.
tally(filter(daner_cpid, SNP != snpid))

## # A tibble: 1 x 1
##       n
##   <int>
## 1 49169

Unmatched sites

# Sumstats rows whose (CHR, SNP) pair does not appear among the matched rows.
daner_cpid_nomatch <- anti_join(daner, daner_cpid, by = c('CHR', 'SNP'))

Update SNP ID

# For matched rows, replace the sumstats SNP ID with the reference snpid and
# keep only the sumstats columns.
daner_aligned <- select(
  daner_cpid,
  CHR, SNP = snpid, BP, A1, A2, starts_with('FRQ'), INFO, OR, SE, P, ngt
)

# Append the unmatched rows unchanged, then sort by chromosome and position.
daner_aligned <- bind_rows(daner_aligned, daner_cpid_nomatch)
daner_aligned <- arrange(daner_aligned, CHR, BP)

write_tsv(daner_aligned, 'daner_mdd_genscot_1119a_rmUKBB.aligned.gz')

Align sumstats with RICOPILI pipeline

Mark Adams