import requests import json import re files_endpt = "https://api.gdc.cancer.gov/files" filters = { "op": "and", "content": [ { "op": "in", "content": { "field": "cases.project.primary_site", "value": [ "Lung" ] } }, { "op": "in", "content": { "field": "cases.demographic.race", "value": [ "white" ] } }, { "op": "in", "content": { "field": "cases.demographic.gender", "value": [ "female" ] } }, { "op": "in", "content": { "field": "files.analysis.workflow_type", "value": [ "STAR - Counts" ] } }, { "op": "in", "content": { "field": "files.access", "value": [ "open" ] } } ] } # Here a GET is used, so the filter parameters should be passed as a JSON string. params = { "filters": json.dumps(filters), "fields": "file_id", "format": "JSON", "size": "10" } response = requests.get(files_endpt, params = params) file_uuid_list = [] # This step populates the download list with the file_ids from the previous query for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]: file_uuid_list.append(file_entry["file_id"]) data_endpt = "https://api.gdc.cancer.gov/data" params = {"ids": file_uuid_list} response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"}) response_head_cd = response.headers["Content-Disposition"] file_name = re.findall("filename=(.+)", response_head_cd)[0] with open(file_name, "wb") as output_file: output_file.write(response.content)