# UKB-project / 01_Parquet_extraction.R
# 01_Parquet_extraction.R
# Raw
###############################################################################################

## Parquet extraction

###############################################################################################

# Try initial extraction of parquet files
# The parquet export for the whole of UKB is split into 23 part files, each holding a chunk of individuals
# arrow::read_parquet()
# data.table::rbindlist(lapply(Sys.glob("path/part*parquet"), arrow::read_parquet))
# Sys.setenv("ARROW_WITH_SNAPPY" = "ON")
# install.packages("arrow",force = TRUE)

# Load packages
library(arrow)
library(data.table)
library(tidyverse)

# Read in one parquet part as a quick test that arrow can open the files.
# NOTE(review): `file` shadows the name of base::file(); the name is kept in
# case anything later in the workflow refers to this object.
file <- arrow::read_parquet("path/...")

# Read in the parquet files for the whole of UKB and bind them together.
# Fail fast if the glob matches nothing: rbindlist() on an empty list returns
# an empty table, and that empty result would otherwise propagate silently.
parquet_paths <- Sys.glob("path/...")
if (length(parquet_paths) == 0L) {
  stop("No parquet files matched the glob pattern.", call. = FALSE)
}
my_df <- data.table::rbindlist(lapply(parquet_paths, arrow::read_parquet))

# Olink protein measurements for 54,189 individuals and 1,474 protein levels.
# Read in here only to source the participant IDs (column App_26041).
olink_internal <- read.csv("path/...")

# `$` returns NULL for a missing column (and partial-matches names on a
# data.frame), which would make the `%in%` filter below match nothing without
# raising an error. Check both ID columns explicitly before subsetting.
if (!"App_26041" %in% names(olink_internal)) {
  stop("Column 'App_26041' not found in the Olink file.", call. = FALSE)
}
if (!"f.eid" %in% names(my_df)) {
  stop("Column 'f.eid' not found in the UKB data.", call. = FALSE)
}

# Subset UKB data to just those with Olink IDs (original run: 54189 rows).
# `%in%` never returns NA, so the logical mask can index directly and the
# which() wrapper is unnecessary.
has_olink <- my_df[["f.eid"]] %in% olink_internal[["App_26041"]]
my_df3 <- my_df[has_olink, ]

# Save the Olink subset without row names. FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned, not a reserved word.
write.csv(my_df3, "path/...", row.names = FALSE)