Downloading Sentinel-2 data from Google Cloud Storage with Python?

Question

I'm trying to download Sentinel-2 data from Google Cloud Storage and basically adapted the FeLS module (great work, by the way!). As the module does, I first download the index.csv (latest version here, but be careful... it's huge!) to search for scenes that match my requirements. But after several successful tests, I found that there are scenes available in the bucket that are not listed in the CSV.

I don't know why and have already contacted Google about this, so now I'm looking for another solution: is there a way to establish a connection to the tile I'm looking for and then list all available subfolders? For example, I would like to do something like this:

import urllib2
from bs4 import BeautifulSoup

url = 'https://console.cloud.google.com/storage/browser/gcp-public-data-sentinel-2/tiles/39/R/YH/'
page = urllib2.urlopen(url)
# The fetched page is the markup to parse; the parser name is the second argument.
soup = BeautifulSoup(page, 'html.parser')
# the following has been working for another link (https://landsatonaws.com/L8/001/003/), ...
# ... this is just to make clear what I want to do:
table = soup('table')[0].find_all('td')
# Take every third cell of the first table, starting with the first one.
scenes = [cell.string for cell in table[::3]]

For the given Google Cloud Storage URL, I never get anything back except an HTTPError: HTTP Error 404: Not Found.

If I could get rid of searching the index.csv, I would truly get all scenes that are available. Is this possible somehow?

s6hebern · Accepted Answer

I was able to achieve it using the google-cloud-bigquery module. You need a Google Cloud BigQuery key file for this, which you can create by following these instructions. You also need a Google Cloud project and its project ID; then you can do something like this:

import os
import requests
from google.cloud import bigquery
from google.oauth2 import service_account

BASE_URL = 'http://storage.googleapis.com/'

def query_sentinel(key_json, project_id, start, end, tile, cloud=100.):
    """Return the HTTP base URLs of the indexed scenes for one tile and date range.

    Queries the public ``sentinel_2_index`` BigQuery table and keeps the rows
    whose cloud cover does not exceed ``cloud``.

    Args:
        key_json: Path to the service-account key file used for authentication.
        project_id: ID of the project the query job runs under.
        start: First sensing date to include, as a string BigQuery can cast
            to DATE (the example call uses 'YYYY-MM-DD'). Inclusive.
        end: Sensing date at which to stop. Exclusive.
        tile: Value to match against the ``mgrs_tile`` column (e.g. '55LFL').
        cloud: Maximum accepted ``cloud_cover`` value (inclusive).

    Returns:
        A list of strings: each matching row's ``base_url`` with the
        ``gs://`` prefix replaced by ``BASE_URL``.
    """
    credentials = service_account.Credentials.from_service_account_file(key_json)
    client = bigquery.Client(credentials=credentials, project=project_id)

    # Values are bound as named query parameters instead of being formatted
    # into the SQL text, so quotes in the arguments cannot alter the query.
    sql = """
        SELECT * FROM `bigquery-public-data.cloud_storage_geo_index.sentinel_2_index`
            WHERE (mgrs_tile = @tile AND
            CAST(SUBSTR(sensing_time, 1, 10) AS DATE) >= CAST(@start_date AS DATE) AND
            CAST(SUBSTR(sensing_time, 1, 10) AS DATE) < CAST(@end_date AS DATE))
        """
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = [
        bigquery.ScalarQueryParameter('tile', 'STRING', tile),
        bigquery.ScalarQueryParameter('start_date', 'STRING', start),
        bigquery.ScalarQueryParameter('end_date', 'STRING', end),
    ]
    df = client.query(sql, job_config=job_config).result().to_dataframe()

    good_scenes = []
    for _, row in df.iterrows():
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('{p} ; cloud cover: {c}'.format(p=row['product_id'], c=row['cloud_cover']))
        if float(row['cloud_cover']) <= cloud:
            good_scenes.append(row['base_url'].replace('gs://', BASE_URL))
    return good_scenes

Afterwards, you can download the manifest.safe to get the typical SAFE-structure and download all necessary files, for example like this:

def download_file(url, dst_name):
    """Stream the content at ``url`` into the local file ``dst_name``.

    Args:
        url: Address of the file to fetch.
        dst_name: Path of the file to write; it is opened in binary mode and
            overwritten if it exists.

    Returns:
        True when the file was written completely, False when the request
        failed, the server answered with an error status, or the file could
        not be written. A failure message is printed in the False case.
    """
    try:
        data = requests.get(url, stream=True)
        try:
            # Without this check an error page (e.g. a 404 body) would be
            # saved to disk as if it were the requested file.
            data.raise_for_status()
            with open(dst_name, 'wb') as out_file:
                for chunk in data.iter_content(chunk_size=100 * 100):
                    out_file.write(chunk)
        finally:
            # Streamed responses hold the connection until they are closed.
            data.close()
    except (requests.RequestException, IOError):
        print('\t ... {f} FAILED!'.format(f=url.split('/')[-1]))
        return False
    return True

def _manifest_hrefs(manifest_path):
    """Return the ``href`` attribute of every ``fileLocation`` element in a manifest file."""
    import xml.etree.ElementTree as ET  # local import: only this helper needs it

    try:
        root = ET.parse(manifest_path).getroot()
    except ET.ParseError:
        print('\t ... {f} could not be parsed!'.format(f=os.path.basename(manifest_path)))
        return []
    hrefs = []
    for elem in root.iter():
        # Compare only the local tag name so a namespace prefix does not matter.
        if elem.tag.rsplit('}', 1)[-1] == 'fileLocation' and elem.get('href'):
            hrefs.append(elem.get('href'))
    return hrefs


def make_safe_dirs(scene, outpath):
    """Download a scene's manifest and derive the list of files to fetch.

    The manifest is (re-)downloaded to ``<outpath>/<scene name>/manifest.safe``
    and parsed as XML, so the result does not depend on how the ``href``
    attribute is laid out within a line of the file.

    Args:
        scene: Base URL of the scene; its last path component is the scene name.
        outpath: Local directory under which the scene directory lives.

    Returns:
        A list of ``(url, local_path)`` tuples, one per referenced file.
        Entries under ``/GRANULE/`` are kept only if their path contains the
        scene's tile token (second-to-last ``_``-separated part of the scene
        name) surrounded by underscores.

    Raises:
        IOError: If the manifest file does not exist after the download attempt.
    """
    scene_name = os.path.basename(scene)
    scene_path = os.path.join(outpath, scene_name)
    manifest = os.path.join(scene_path, 'manifest.safe')
    manifest_url = scene + '/manifest.safe'

    # The manifest can only be written if its directory exists.
    if not os.path.isdir(scene_path):
        os.makedirs(scene_path)
    if os.path.exists(manifest):
        os.remove(manifest)
    download_file(manifest_url, manifest)

    # The tile token is the same for every entry, so derive it once.
    name_parts = scene_name.split('_')
    tile = name_parts[-2] if len(name_parts) >= 2 else ''
    tile_marker = '_' + tile + '_'

    download_links = []
    for href in _manifest_hrefs(manifest):
        # Turn a relative './X' reference into '/X' so it can be appended to the scene URL.
        online_path = href[1:] if href.startswith('./') else href
        if not online_path.startswith('/'):
            online_path = '/' + online_path
        if online_path.startswith('/GRANULE/') and tile_marker not in online_path:
            continue
        local_path = os.path.join(scene_path, *online_path.split('/')[1:])
        download_links.append((scene + online_path, local_path))

    # These two directories are always created, whether or not the manifest lists files in them.
    for extra_dir in ('AUX_DATA', 'HTML'):
        extra_path = os.path.join(scene_path, extra_dir)
        if not os.path.exists(extra_path):
            os.makedirs(extra_path)
    return download_links

def download_sentinel(scene, dst):
    """Download every file of one scene into ``<dst>/<scene name>``.

    The file list comes from ``make_safe_dirs``; files are fetched in sorted
    order and an existing local copy is deleted before it is downloaded again.
    The loop stops at the first file for which ``download_file`` returns
    ``False``.

    Args:
        scene: Base URL of the scene; its last path component is the scene name.
        dst: Local directory that receives the scene directory. Missing
            parent directories are created.
    """
    scene_name = scene.split('/')[-1]
    scene_path = os.path.join(dst, scene_name)
    if not os.path.exists(scene_path):
        # makedirs rather than mkdir so a not-yet-existing ``dst`` is handled too.
        os.makedirs(scene_path)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('Downloading scene {s} ...'.format(s=scene_name))
    for online_path, local_path in sorted(make_safe_dirs(scene, dst)):
        local_dir = os.path.dirname(local_path)
        if not os.path.exists(local_dir):
            os.makedirs(local_dir)
        if os.path.exists(local_path):
            os.remove(local_path)
        if local_path.endswith('.jp2'):
            # Progress line: the part of the path after the last underscore.
            print('\t ... *{b}'.format(b=local_path.split('_')[-1]))
        if download_file(online_path, local_path) is False:
            print('\t ... {f} failed to download! Download for this scene is cancelled here!'.format(f=online_path))
            return

### finally do it ###
if __name__ == '__main__':
    # Access settings -- replace the placeholders before running.
    key_json = r'PATH/TO/BIGQUERY_KEY_JSON'
    project_id = 'YOUR_PROJECT_ID'
    outdir = r'YOUR/OUTPUT/PATH'

    # Search criteria: tile, sensing-date window and cloud-cover limit.
    tile = '55LFL'
    start = '2017-02-01'
    end = '2017-02-28'
    cloud = 20

    # Look up all matching scenes first, then download them one by one.
    scene_list = query_sentinel(key_json, project_id, start, end, tile, cloud)
    for scene_url in scene_list:
        download_sentinel(scene_url, outdir)

thomas84 · Answer

This script worked for me after I removed the "- 2" from the online_path line in the function make_safe_dirs:
online_path = line[7:line.find('><')]

Edwah Za · Answer

There seem to be two variants of manifest.safe: one needs online_path = line[7:line.find('><') - 2], the other needs online_path = line[7:line.find('><')]. To handle both, I extract online_path with an XML parser instead:

#from bs4 import BeautifulSoup as bs

    # Fragment meant to replace the manifest-parsing part of make_safe_dirs.
    # It relies on `manifest`, `scene`, `scene_name` and `scene_path` being
    # defined by the surrounding function, and on `bs` being BeautifulSoup
    # (see the commented-out import above the fragment).
    with open(manifest, 'r') as f: manifest_text = f.read()
    # Parse the manifest as XML instead of slicing lines at fixed offsets.
    soup = bs(manifest_text,'xml')
    # Every <fileLocation> element carries the path of one file in `href`.
    hs  = soup.select('fileLocation')
    download_links = []
    load_this = False
    for h in hs:
        # Drop the first character of the href (presumably the '.' of a
        # leading './', so the path starts with '/' -- verify against the manifest).
        online_path = h['href'][1:]
        tile = scene_name.split('_')[-2]
        # GRANULE entries are kept only when their path contains the tile
        # token between underscores; all other entries are always kept.
        if online_path.startswith('/GRANULE/'):
            if '_' + tile + '_' in online_path:
                load_this = True
        else:
            load_this = True
        if load_this:
            local_path = os.path.join(scene_path, *online_path.split('/')[1:])
            online_path = scene + online_path
            download_links.append((online_path, local_path))
        # Reset the flag for the next entry.
        load_this = False

Answered by Edwah Za on May 16, 2021

Downloading Sentinel-2 data from Google Cloud Storage with Python?

3 Answers

Add your own answers!

Ask a Question