From 6378337f5f31d9c3afbcedc95853dd90a1d6bb17 Mon Sep 17 00:00:00 2001
From: Troy Raen
Date: Mon, 22 Jul 2024 14:58:29 -0700
Subject: [PATCH 1/2] re-add gcs tutorial

---
 cloud-storage/README.rst | 128 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 cloud-storage/README.rst

diff --git a/cloud-storage/README.rst b/cloud-storage/README.rst
new file mode 100644
index 0000000..c8840a7
--- /dev/null
+++ b/cloud-storage/README.rst
@@ -0,0 +1,128 @@

.. _cloud storage:

Cloud Storage Tutorial
======================

.. contents:: Table of Contents
   :depth: 1
   :local:

This tutorial covers access via two methods: pittgoogle-client (with some direct use
of the Google Cloud API) and the gsutil CLI.

Prerequisites
-------------

Complete the `One-Time Setup <https://mwvgroup.github.io/pittgoogle-client/one-time-setup>`__, specifically:

- Install the ``pittgoogle-client`` package
- Set up authentication to a Google Cloud project
- Set environment variables
- Enable the Cloud Storage API
- If you want to follow the command-line section of this tutorial, install the command-line tools

Python
------

Setup
~~~~~

Imports

.. code:: python

    import os
    from pathlib import Path

    import fastavro
    import google.cloud.storage
    import pittgoogle
    from matplotlib import pyplot as plt

Name some things

.. code:: python

    # fill in the path to the local directory to which you want to download files
    local_dir = ''

    my_projectid = os.getenv('GOOGLE_CLOUD_PROJECT')
    pittgoogle_projectid = pittgoogle.ProjectIds().pittgoogle

Download files
~~~~~~~~~~~~~~

Download alerts for a given objectId.

.. code:: python

    objectId = 'ZTF19acfixfe'
    bucket_name = f'{pittgoogle_projectid}-ztf_alerts_v4_02'

    # create a client and request a list of files
    storage_client = google.cloud.storage.Client(my_projectid)
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=objectId)

    # download the files
    for blob in blobs:
        local_path = f'{local_dir}/{blob.name}'
        blob.download_to_filename(local_path)
        print(f'Downloaded {local_path}')

Open a file
~~~~~~~~~~~

Load to a dict:

.. code:: python

    paths = Path(local_dir).glob('*.avro')
    for path in paths:
        with open(path, 'rb') as fin:
            alert_list = [r for r in fastavro.reader(fin)]
        break
    alert_dict = alert_list[0]  # extract the single alert packet

    print(alert_dict.keys())

Load to a pandas DataFrame:

.. code:: python

    lightcurve_df = pittgoogle.utils.Cast.alert_dict_to_dataframe(alert_dict)

Plot light curves and cutouts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

See :ref:`ztf figures`.

Command line
------------

See also:

- `Quickstart: Using the gsutil tool `__
- `gsutil cp `__

Get help

.. code:: bash

    gsutil help
    gsutil help cp

Download a single file

.. code:: bash

    # fill in the path to the local directory to which you want to download files
    local_dir=
    # fill in the name of the file you want. See above for the syntax
    file_name=
    # file_name=ZTF17aaackje.1563161493315010012.ztf_20210413_programid1.avro
    # fill in the Pitt-Google project ID (the same value as pittgoogle_projectid in the Python section)
    pittgoogle_projectid=
    avro_bucket="${pittgoogle_projectid}-ztf-alert_avros"

    gsutil cp "gs://${avro_bucket}/${file_name}" "${local_dir}/."

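Download all files for an object. gsutil accepts wildcards, and the file names begin with the objectId (see the syntax above), so a prefix wildcard collects every alert for an object. This is a minimal sketch that reuses the variables set above; the objectId is an arbitrary example.

.. code:: bash

    # fill in the objectId whose alerts you want
    objectId=ZTF19acfixfe

    # list the matching files, then download them in parallel (-m)
    gsutil ls "gs://${avro_bucket}/${objectId}*"
    gsutil -m cp "gs://${avro_bucket}/${objectId}*" "${local_dir}/."
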
From 2f3072003b348506ba302054b0d834d2105fca02 Mon Sep 17 00:00:00 2001
From: Troy Raen
Date: Mon, 22 Jul 2024 14:58:40 -0700
Subject: [PATCH 2/2] re-add bigquery tutorial

---
 bigquery/README.rst | 402 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 bigquery/README.rst

diff --git a/bigquery/README.rst b/bigquery/README.rst
new file mode 100644
index 0000000..ff21e73
--- /dev/null
+++ b/bigquery/README.rst
@@ -0,0 +1,402 @@

.. _bigquery:

BigQuery Tutorial
==================

.. contents:: Table of Contents
   :depth: 1
   :local:

This tutorial covers access via two methods: pittgoogle-client and the bq CLI.

Prerequisites
-------------

1. Complete the initial setup. In particular, be sure to:

   - :ref:`install` and/or :ref:`install the command-line tools`
   - set up a :ref:`service account`
   - :ref:`set your environment variables`

Python
------

Setup and basics
~~~~~~~~~~~~~~~~

Imports

.. code:: python

    import os

    import pandas as pd  # used below to read JSON-formatted results
    import pittgoogle

Create a Client for the BigQuery connections below

.. code:: python

    my_project_id = os.getenv('GOOGLE_CLOUD_PROJECT')
    pittgoogle.bigquery.create_client(my_project_id)

View the available tables and their schemas

.. code:: python

    # see which tables are available
    pittgoogle.bigquery.get_dataset_table_names()

    # look at the schema and basic info for a table
    table = 'DIASource'
    pittgoogle.bigquery.get_table_info(table)

Query lightcurves and other history
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Setup

.. code:: python

    # choose the history data you want returned
    columns = ['jd', 'fid', 'magpsf', 'sigmapsf']
    # 'objectId' and 'candid' will be included automatically
    # the options are the columns in the 'DIASource' table:
    # pittgoogle.bigquery.get_table_info('DIASource')

    # optional: choose specific objects
    objectIds = ['ZTF18aczuwfe', 'ZTF18aczvqcr', 'ZTF20acqgklx', 'ZTF18acexdlh']
    # optional: limit to a sample of the table
    # limit = 1000  # add this keyword to query_objects() below

To retrieve lightcurves and other history, we must query for objects'
"DIASource" observations and aggregate the results by ``objectId``.

``pittgoogle.bigquery.query_objects()`` is a convenience wrapper that lets you
grab all the results at once or step through them using a generator.
Its options are demonstrated below.

.. code:: python

    # Option 1: get a single DataFrame of all results

    lightcurves_df = pittgoogle.bigquery.query_objects(columns, objectIds=objectIds)
    # This executes a dry run and tells you how much data will be processed.
    # You will be asked to confirm before proceeding.
    # In the future, skip the confirmation by passing this keyword:
    dry_run = False

    lightcurves_df.sample(10)
    # the results have been cleaned of duplicates

Congratulations! You've now retrieved your first data from the transient
table. It is a DataFrame containing the candidate observations for every
object we requested, indexed by ``objectId`` and ``candid`` (candidate
ID). It includes the columns we requested in the query.

``fid`` is the filter, mapped to an integer. You can see the filter's
common name in the table schema we looked at earlier, or you can use
``pittgoogle.utils.ztf_fid_names()``, which returns a dictionary of the mapping.

.. code:: python

    # map the fid column to the filter's common name
    fid_names = pittgoogle.utils.ztf_fid_names()  # dict
    print(fid_names)

    lightcurves_df['filter'] = lightcurves_df['fid'].map(fid_names)
    lightcurves_df.head()

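Because the DataFrame is indexed by ``objectId`` and ``candid``, standard pandas indexing pulls out a single object's history. A minimal sketch, assuming the MultiIndex described above and using one of the objectIds we queried:

.. code:: python

    # select all candidate observations for one object (the remaining index is candid)
    single_lc = lightcurves_df.loc['ZTF20acqgklx']

    # sort by observation time and take a look
    single_lc = single_lc.sort_values('jd')
    print(single_lc[['jd', 'filter', 'magpsf', 'sigmapsf']].head())
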
Queries can return large datasets. You may want to use a generator to
step through objects individually and avoid loading the entire dataset
into memory at once. ``query_objects()`` can return one for you:

.. code:: python

    # Option 2: get a generator that yields a DataFrame for each objectId

    iterator = True
    objects = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, iterator=iterator, dry_run=dry_run
    )
    # the results are cleaned of duplicates

    for lightcurve_df in objects:
        print(f'\nobjectId: {lightcurve_df.objectId}')  # objectId is in the metadata
        print(lightcurve_df.sample(5))

Each DataFrame contains data on a single object and is indexed by
``candid``. The ``objectId`` is in the metadata.

``query_objects()`` can also return a JSON-formatted string of the query
results:

.. code:: python

    # Option 3: get a single JSON string with all the results

    format = 'json'
    lcsjson = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format=format, dry_run=dry_run
    )
    # the results are cleaned of duplicates
    print(lcsjson)

    # read it back in to a DataFrame
    df = pd.read_json(lcsjson)
    df.head()

.. code:: python

    # Option 4: get a generator that yields a JSON string for a single objectId

    format = 'json'
    iterator = True
    jobj = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format=format, iterator=iterator, dry_run=dry_run
    )
    # the results are cleaned of duplicates

    for lcjson in jobj:
        print(lcjson)
        # lightcurve_df = pd.read_json(lcjson)  # read back in to a DataFrame

Finally, ``query_objects()`` can return the raw query job object that it
gets from its API call to ``google.cloud.bigquery``'s ``query()``
method.

.. code:: python

    # Option 5: get the `query_job` object
    # (see the section on using google.cloud.bigquery directly)

    query_job = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format="query_job", dry_run=dry_run
    )
    # query_job is iterable
    # each element contains the aggregated history for a single objectId
    # Beware: this has not been cleaned of duplicate entries

.. code:: python

    # Option 5 continued: parse the query_job results row by row

    for row in query_job:
        # values can be accessed by field name or index
        print(f"objectId={row[0]}, magpsf={row['magpsf']}")

        # pittgoogle can cast a row to a DataFrame or a JSON string
        # this option also cleans the duplicates
        lightcurve_df = pittgoogle.bigquery.format_history_query_results(row=row)
        print(f'\nobjectId: {lightcurve_df.objectId}')  # objectId is in the metadata
        print(lightcurve_df.head(1))
        lcjson = pittgoogle.bigquery.format_history_query_results(row=row, format='json')
        print('\n', lcjson)

        break

Plot a lightcurve
^^^^^^^^^^^^^^^^^

The following DataFrame can be used with the code in :ref:`ztf figures` to plot the object's light curves.

.. code:: python

    # get an object's lightcurve DataFrame with the minimum required columns
    columns = ['jd', 'fid', 'magpsf', 'sigmapsf', 'diffmaglim']
    objectId = 'ZTF20acqgklx'
    lightcurve_df = pittgoogle.bigquery.query_objects(columns, objectIds=[objectId], dry_run=False)

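For a quick look without the full :ref:`ztf figures` code, a few lines of matplotlib are enough. This is a minimal sketch assuming the usual ZTF column meanings: ``jd`` is the Julian date, ``magpsf``/``sigmapsf`` are the PSF magnitude and its error, and ``fid`` is the filter ID.

.. code:: python

    from matplotlib import pyplot as plt

    fig, ax = plt.subplots()
    for fid, band in lightcurve_df.groupby('fid'):
        band = band.sort_values('jd')
        ax.errorbar(band['jd'], band['magpsf'], yerr=band['sigmapsf'], fmt='o', label=f'fid={fid}')
    ax.invert_yaxis()  # smaller magnitude = brighter
    ax.set(xlabel='jd', ylabel='magpsf', title=objectId)
    ax.legend()
    plt.show()
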
Cone search
~~~~~~~~~~~

To perform a cone search, we query for object histories and then check
whether they are within the cone. ``pittgoogle.bigquery.cone_search()`` is a
convenience wrapper provided for demonstration, but note that it is very inefficient.

First we set the search parameters.

.. code:: python

    from astropy import coordinates as coord
    from astropy import units as u

    center = coord.SkyCoord(76.91, 6.02, frame='icrs', unit='deg')
    radius = coord.Angle(2, unit=u.deg)

    columns = ['jd', 'fid', 'magpsf', 'sigmapsf']
    # 'objectId' and 'candid' will be included automatically
    # the options are the columns in the 'DIASource' table:
    # pittgoogle.bigquery.get_table_info('DIASource')
    dry_run = False

    # we'll restrict to a handful of objects to reduce the runtime, but this is optional
    objectIds = ['ZTF18aczuwfe', 'ZTF18aczvqcr', 'ZTF20acqgklx', 'ZTF18acexdlh']

``cone_search()`` has options similar to ``query_objects()``.
Here we demonstrate one.

.. code:: python

    # Option 1: get a single DataFrame of all objects in the cone

    objects_in_cone = pittgoogle.bigquery.cone_search(
        center, radius, columns, objectIds=objectIds, dry_run=dry_run
    )
    objects_in_cone.sample(5)

--------------

Using google.cloud.bigquery
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The previous sections demonstrated convenience wrappers for querying
with ``google.cloud.bigquery``. Here we demonstrate using these tools
directly with some basic examples. View the pittgoogle-client source code for
more examples.

Links to more information:

- `Query syntax in Standard SQL `__
- `google.cloud.bigquery docs `__

Query setup:

.. code:: python

    from google.cloud import bigquery

    # create a BigQuery Client to handle the connections
    bq_client = bigquery.Client(project=my_project_id)

.. code:: python

    # Write the standard SQL query statement

    # pittgoogle.bigquery.get_dataset_table_names()  # view the available tables
    # pittgoogle.bigquery.get_table_info('')  # view the available column names

    # construct the full table name
    pgb_project_id = 'ardent-cycling-243415'
    table = 'salt2'
    dataset = 'ztf_alerts'
    full_table_name = f'{pgb_project_id}.{dataset}.{table}'

    # construct the query
    query = (
        f'SELECT objectId, candid, t0, x0, x1, c, chisq, ndof '
        f'FROM `{full_table_name}` '
        f'WHERE ndof>0 and chisq/ndof<2 '
    )

    # note: if you want to query object histories, you can get the
    # query statement using `pittgoogle.bigquery.object_history_sql_statement()`

.. code:: python

    # Let's create a function that executes a "dry run"
    # and tells us how much data will be processed.
    # This is essentially `pittgoogle.bigquery.dry_run()`.
    def dry_run(query):
        job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
        query_job = bq_client.query(query, job_config=job_config)
        nbytes, TiB = query_job.total_bytes_processed, 2**40
        pTiB = nbytes / TiB * 100  # nbytes as a percent of 1 TiB
        print('\nQuery statement:')
        print(f'\n"{query}"\n')
        print(f'will process {nbytes} bytes of data.')
        print(f'({pTiB:.3}% of your 1 TiB Free Tier monthly allotment.)')

.. code:: python

    # find out how much data will be processed
    dry_run(query)

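If you want a hard cap rather than just a dry-run estimate, ``google.cloud.bigquery``'s ``QueryJobConfig`` accepts a ``maximum_bytes_billed`` limit; the job raises an error instead of running if it would exceed it. A minimal sketch; the 10 GiB cap is an arbitrary example value.

.. code:: python

    # refuse to run the query if it would bill more than ~10 GiB
    job_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 * 2**30)
    capped_job = bq_client.query(query, job_config=job_config)
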
Query:

.. code:: python

    # make the API request
    query_job = bq_client.query(query)
    # Beware: the results may contain duplicate entries

Format and view results:

.. code:: python

    # Option 1: dump the results to a pandas.DataFrame
    df = query_job.to_dataframe()

    # some things you might want to do with it
    df = df.drop_duplicates()
    df = df.set_index(['objectId', 'candid']).sort_index()

    df.hist()
    df.head()

.. code:: python

    # Option 2: parse the results row by row
    for r, row in enumerate(query_job):

        # row values can be accessed by field name or index
        print(f"objectId={row[0]}, t0={row['t0']}")

        if r > 5:
            break

--------------

Command line
------------

Links to more information:

- `Quickstart using the bq command-line tool `__
- `Reference of all bq commands and flags `__
- `Query syntax in Standard SQL `__

.. code:: bash

    # get help
    bq help query

.. code:: bash

    # view the schema of a table
    bq show --schema --format=prettyjson ardent-cycling-243415:ztf_alerts.DIASource
    # bq show --schema --format=prettyjson ardent-cycling-243415:ztf_alerts.alerts

    # Note: the first time you make a call with `bq`, it will ask you to
    # initialize a .bigqueryrc configuration file. Follow the directions.

.. code:: bash

    # Query: dry run

    # first do a dry run by including the flag --dry_run
    bq query \
        --dry_run \
        --use_legacy_sql=false \
        'SELECT
            objectId, candid, t0, x0, x1, c, chisq, ndof
        FROM
            `ardent-cycling-243415.ztf_alerts.salt2`
        WHERE
            ndof>0 and chisq/ndof<2
        LIMIT
            10'

.. code:: bash

    # execute the query
    # note: single quotes keep bash from treating the backticks around the table name
    # as command substitution
    bq query \
        --use_legacy_sql=false \
        'SELECT
            objectId, candid, t0, x0, x1, c, chisq, ndof
        FROM
            `ardent-cycling-243415.ztf_alerts.salt2`
        WHERE
            ndof>0 and chisq/ndof<2
        LIMIT
            10'

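To spot-check a table's contents without composing a query at all, the ``bq head`` command prints the first rows of a table. A quick sketch; adjust the table name and row count to taste.

.. code:: bash

    # preview the first 5 rows of a table
    bq head -n 5 ardent-cycling-243415:ztf_alerts.salt2
    # bq head -n 5 ardent-cycling-243415:ztf_alerts.DIASource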