###############################################################################
## Parquet extraction
###############################################################################

# Initial extraction of the UKB parquet files.
# There are 23 parts to the parquet files, which are split by individuals into
# chunks for the whole of UKB.
#
# Scratch notes kept from the first attempt:
#   arrow::read_parquet()
#   data.table::rbindlist(lapply(Sys.glob("path/part*parquet"), arrow::read_parquet))
#   Sys.setenv("ARROW_WITH_SNAPPY" = "ON")
#   install.packages("arrow", force = TRUE)

# Load packages ----
library(arrow)
library(data.table)
library(tidyverse)

# Read in one parquet file as a test ----
# NOTE(review): `file` masks base::file() in the global environment; the name
# is kept unchanged in case later code refers to it.
file <- read_parquet("path/...")

# Read in the parquet files for the whole of UKB and bind together ----
parquet_paths <- Sys.glob("path/...")
# Fail loudly when the glob matches nothing: otherwise rbindlist(list()) yields
# an empty table and an empty CSV would be written without any warning.
if (length(parquet_paths) == 0L) {
  stop("No parquet files matched the glob pattern.", call. = FALSE)
}
my_df <- data.table::rbindlist(lapply(parquet_paths, arrow::read_parquet))

# Olink participant IDs ----
# Protein measurements for 54,189 individuals and 1,474 protein levels - read
# in to source IDs (App_26041).
olink_internal <- read.csv("path/...")

# A missing ID column would make `$` return NULL and the subset below would be
# silently empty, so check both columns up front.
stopifnot(
  "f.eid" %in% names(my_df),
  "App_26041" %in% names(olink_internal)
)

# Subset UKB data to just those with Olink IDs ----
# `%in%` never returns NA, so the logical mask can index rows directly.
my_df3 <- my_df[my_df$f.eid %in% olink_internal$App_26041, ] # 54189
write.csv(my_df3, "path/...", row.names = FALSE)