In [1]:
import pandas as pd
import numpy as np


In [2]:
file_name = "table.txt"

df = pd.read_table(file_name, sep='\t')
print(df.columns) # check the column names, we want library_name to be there!


Index(['study_accession', 'sample_accession', 'secondary_sample_accession',
       'experiment_accession', 'run_accession', 'tax_id', 'scientific_name',
       'instrument_model', 'library_name', 'library_layout', 'fastq_bytes',
       'fastq_ftp'],
      dtype='object')


In [3]:
patient_ids = df['library_name'].astype(str).str[0:6]
patients = set(patient_ids)
print(sorted(patients)) 
# there are 21 patients, but there are supposed to be only 20 patients according to the paper
# KTN609 does not appear in the appendix of the paper: https://www.cell.com/cms/attachment/2119295259/2091819478/mmc1.pdf

['KTN102', 'KTN115', 'KTN126', 'KTN129', 'KTN132', 'KTN134', 'KTN147', 'KTN152', 'KTN155', 'KTN206', 'KTN210', 'KTN215', 'KTN302', 'KTN304', 'KTN310', 'KTN316', 'KTN317', 'KTN501', 'KTN609', 'KTN612', 'KTN615']


In [4]:
# how many data files are there associated with patient KTN102?
patient_id = 'KTN152'
print(np.sum(patient_ids == patient_id)) # there are 511 files associated with this patient


1035


For this dataset, there are 
+ bulk data: BLOOD, PRE, MID, POST, 
+ single cell data: scDNA and scRNA.

Let's extract out the file name patterns.

In [5]:
#df2 = df.loc[patient_ids == patient_id]
df2 = df[patient_ids == patient_id]
sorted(list(df2['library_name']))

['KTN1520',
 'KTN1520cell1',
 'KTN1520cell10',
 'KTN1520cell100',
 'KTN1520cell101',
 'KTN1520cell102',
 'KTN1520cell103',
 'KTN1520cell104',
 'KTN1520cell105',
 'KTN1520cell106',
 'KTN1520cell107',
 'KTN1520cell108',
 'KTN1520cell109',
 'KTN1520cell11',
 'KTN1520cell110',
 'KTN1520cell111',
 'KTN1520cell112',
 'KTN1520cell113',
 'KTN1520cell114',
 'KTN1520cell115',
 'KTN1520cell116',
 'KTN1520cell117',
 'KTN1520cell118',
 'KTN1520cell119',
 'KTN1520cell12',
 'KTN1520cell120',
 'KTN1520cell121',
 'KTN1520cell122',
 'KTN1520cell123',
 'KTN1520cell124',
 'KTN1520cell125',
 'KTN1520cell126',
 'KTN1520cell127',
 'KTN1520cell128',
 'KTN1520cell129',
 'KTN1520cell13',
 'KTN1520cell130',
 'KTN1520cell131',
 'KTN1520cell132',
 'KTN1520cell133',
 'KTN1520cell134',
 'KTN1520cell135',
 'KTN1520cell136',
 'KTN1520cell137',
 'KTN1520cell138',
 'KTN1520cell139',
 'KTN1520cell14',
 'KTN1520cell140',
 'KTN1520cell141',
 'KTN1520cell142',
 'KTN1520cell143',
 'KTN1520cell144',
 'KTN1520cell145',
 'KTN15

Browsing through the values seems to indicate that the library names have certain patterns:
+ KTN102Blood, KTN1020, KTN1022, KTN102OP seem to refer to bulk samples
    - For KTN1020 and KTN102OP, there seem to be library names with the following suffix 'cells[0-9]+'. These are either DNA or RNA data for population (KTN1020 and KTN102OP).
+ The other library names have the following pattern: KTN102\_0/2/OP\_[0-9|A-Z|a-Z]+
    - Some or all of these files are likely to be single cell data, but we do not know which correspond to RNA and which to DNA samples
    - We may be able to use the number of fastq.gz files attached to each library to help identify RNA from DNA samples since RNA samples should be single stranded.
    - We may also use the file sizes to our advantage.

In [16]:
# let's check the data sizes for the files to find the bulk samples
df2 = df2.dropna()
ret = list(map(lambda row: np.asarray(str(row).split(";"), dtype=int), list(df2['fastq_bytes'])))
ret = np.asarray(ret)
ret2 = list(map(lambda row: np.sum(row)/np.power(10,9), ret)) # in GB
ret2 = np.asarray(ret2)
print(list(df2[ret2 > 1]["fastq_ftp"]))


['ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/009/SRR5908339/SRR5908339.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/009/SRR5908339/SRR5908339_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/009/SRR5908339/SRR5908339_2.fastq.gz', 'ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/000/SRR5908340/SRR5908340.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/000/SRR5908340/SRR5908340_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/000/SRR5908340/SRR5908340_2.fastq.gz', 'ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/004/SRR5908364/SRR5908364.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/004/SRR5908364/SRR5908364_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/004/SRR5908364/SRR5908364_2.fastq.gz', 'ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/005/SRR5908365/SRR5908365.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/005/SRR5908365/SRR5908365_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR590/005/SRR5908365/SRR5908365_2.fastq.gz']


In [8]:
ret2[ret2 > 1] # large files probably correspond to bulk, matching our intuition

array([ 7.92877125,  7.99732438,  6.66749365,  8.77022955])

In [9]:
# let's check the number of files associated with each row
num_files = np.array(list(map(lambda row: row.shape[0], ret)))

In [19]:
s1 = sorted(df2[num_files == 1]["library_name"])
print(len(s1)) # 143
s1

143


['KTN102_0_10',
 'KTN102_0_11',
 'KTN102_0_12',
 'KTN102_0_13',
 'KTN102_0_14',
 'KTN102_0_16',
 'KTN102_0_17',
 'KTN102_0_18',
 'KTN102_0_19',
 'KTN102_0_2',
 'KTN102_0_20',
 'KTN102_0_21',
 'KTN102_0_22',
 'KTN102_0_23',
 'KTN102_0_24',
 'KTN102_0_25',
 'KTN102_0_26',
 'KTN102_0_27',
 'KTN102_0_28',
 'KTN102_0_29',
 'KTN102_0_3',
 'KTN102_0_30',
 'KTN102_0_31',
 'KTN102_0_32',
 'KTN102_0_33',
 'KTN102_0_34',
 'KTN102_0_35',
 'KTN102_0_36',
 'KTN102_0_37',
 'KTN102_0_38',
 'KTN102_0_39',
 'KTN102_0_4',
 'KTN102_0_40',
 'KTN102_0_41',
 'KTN102_0_42',
 'KTN102_0_43',
 'KTN102_0_44',
 'KTN102_0_45',
 'KTN102_0_46',
 'KTN102_0_47',
 'KTN102_0_48',
 'KTN102_0_5',
 'KTN102_0_6',
 'KTN102_0_7',
 'KTN102_0_8',
 'KTN102_0_9',
 'KTN102_0_Pop',
 'KTN102_2_01',
 'KTN102_2_02',
 'KTN102_2_03',
 'KTN102_2_04',
 'KTN102_2_05',
 'KTN102_2_06',
 'KTN102_2_07',
 'KTN102_2_08',
 'KTN102_2_09',
 'KTN102_2_10',
 'KTN102_2_11',
 'KTN102_2_12',
 'KTN102_2_13',
 'KTN102_2_14',
 'KTN102_2_15',
 'KTN102_2_16',

In [20]:
s2 = sorted(df2[num_files == 2]["library_name"])
print(len(s2)) # 275
s2

275


['KTN1020cells1',
 'KTN1020cells10',
 'KTN1020cells11',
 'KTN1020cells12',
 'KTN1020cells13',
 'KTN1020cells14',
 'KTN1020cells15',
 'KTN1020cells16',
 'KTN1020cells17',
 'KTN1020cells18',
 'KTN1020cells19',
 'KTN1020cells2',
 'KTN1020cells20',
 'KTN1020cells21',
 'KTN1020cells22',
 'KTN1020cells23',
 'KTN1020cells24',
 'KTN1020cells25',
 'KTN1020cells26',
 'KTN1020cells27',
 'KTN1020cells28',
 'KTN1020cells29',
 'KTN1020cells3',
 'KTN1020cells30',
 'KTN1020cells31',
 'KTN1020cells32',
 'KTN1020cells33',
 'KTN1020cells34',
 'KTN1020cells35',
 'KTN1020cells36',
 'KTN1020cells37',
 'KTN1020cells38',
 'KTN1020cells39',
 'KTN1020cells4',
 'KTN1020cells40',
 'KTN1020cells41',
 'KTN1020cells42',
 'KTN1020cells43',
 'KTN1020cells44',
 'KTN1020cells45',
 'KTN1020cells46',
 'KTN1020cells47',
 'KTN1020cells48',
 'KTN1020cells49',
 'KTN1020cells5',
 'KTN1020cells50',
 'KTN1020cells51',
 'KTN1020cells52',
 'KTN1020cells53',
 'KTN1020cells54',
 'KTN1020cells55',
 'KTN1020cells56',
 'KTN1020cells57'

In [21]:
s3 = sorted(df2[num_files == 3]["library_name"])
print(len(s3)) # 93
s3

93


['KTN1020',
 'KTN1022',
 'KTN102Blood',
 'KTN102OP',
 'KTN102_0_10_B2',
 'KTN102_0_11_B2',
 'KTN102_0_12_B2',
 'KTN102_0_14_B2',
 'KTN102_0_15_B2',
 'KTN102_0_16_B2',
 'KTN102_0_17_B2',
 'KTN102_0_18_B2',
 'KTN102_0_19_B2',
 'KTN102_0_1_B2',
 'KTN102_0_20_B2',
 'KTN102_0_22_B2',
 'KTN102_0_23_B2',
 'KTN102_0_24_B2',
 'KTN102_0_25_B2',
 'KTN102_0_26_B2',
 'KTN102_0_27_B2',
 'KTN102_0_28_B2',
 'KTN102_0_29_B2',
 'KTN102_0_2_B2',
 'KTN102_0_30_B2',
 'KTN102_0_31_B2',
 'KTN102_0_32_B2',
 'KTN102_0_33_B2',
 'KTN102_0_34_B2',
 'KTN102_0_35_B2',
 'KTN102_0_36_B2',
 'KTN102_0_37_B2',
 'KTN102_0_38_B2',
 'KTN102_0_39_B2',
 'KTN102_0_40_B2',
 'KTN102_0_41_B2',
 'KTN102_0_43_B2',
 'KTN102_0_44_B2',
 'KTN102_0_45_B2',
 'KTN102_0_46_B2',
 'KTN102_0_47_B2',
 'KTN102_0_48_B2',
 'KTN102_0_4_B2',
 'KTN102_0_5_B2',
 'KTN102_0_6_B2',
 'KTN102_0_7_B2',
 'KTN102_0_8_B2',
 'KTN102_0_9_B2',
 'KTN102_2_10_B2',
 'KTN102_2_11_B2',
 'KTN102_2_12_B2',
 'KTN102_2_13_B2',
 'KTN102_2_14_B2',
 'KTN102_2_15_B2',
 'KTN

+ Rows that are associated with single file seem to be for scRNA
+ Rows associated with two files seem to be for scDNA
+ Rows associated with three files seem to be for bulk and/or single cell data (this will have to be determined by reading the paper) but it's clear from the size of the files, which ones correspond to bulk data.