From 6378337f5f31d9c3afbcedc95853dd90a1d6bb17 Mon Sep 17 00:00:00 2001
From: Troy Raen
Date: Mon, 22 Jul 2024 14:58:29 -0700
Subject: [PATCH 1/2] re-add gcs tutorial

---
 cloud-storage/README.rst | 128 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 cloud-storage/README.rst

diff --git a/cloud-storage/README.rst b/cloud-storage/README.rst
new file mode 100644
index 0000000..c8840a7
--- /dev/null
+++ b/cloud-storage/README.rst
@@ -0,0 +1,128 @@

.. _cloud storage:

Cloud Storage Tutorial
======================

.. contents:: Table of Contents
   :depth: 1
   :local:

This tutorial covers access via two methods: pittgoogle-client (with some direct use
of the Google Cloud API) and the gsutil CLI.

Prerequisites
-------------

Complete the `One-Time Setup <https://mwvgroup.github.io/pittgoogle-client/one-time-setup>`__, specifically:

- Install the ``pittgoogle-client`` package
- Set up authentication to a Google Cloud project
- Set environment variables
- Enable the Cloud Storage API
- If you want to follow the command-line section of this tutorial, install the command-line tools

Python
------

Setup
~~~~~

Imports

.. code:: python

    import os
    from pathlib import Path

    import fastavro
    import google.cloud.storage
    import pittgoogle
    from matplotlib import pyplot as plt

Name some things

.. code:: python

    # fill in the path to the local directory to which you want to download files
    local_dir = ''

    my_projectid = os.getenv('GOOGLE_CLOUD_PROJECT')
    pittgoogle_projectid = pittgoogle.ProjectIds().pittgoogle

Download files
~~~~~~~~~~~~~~

Download alerts for a given objectId.

.. code:: python

    objectId = 'ZTF19acfixfe'
    bucket_name = f'{pittgoogle_projectid}-ztf_alerts_v4_02'

    # create a client and request a list of files
    storage_client = google.cloud.storage.Client(my_projectid)
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=objectId)

    # download the files
    for blob in blobs:
        local_path = f'{local_dir}/{blob.name}'
        blob.download_to_filename(local_path)
        print(f'Downloaded {local_path}')

Open a file
~~~~~~~~~~~

Load to a dict:

.. code:: python

    paths = Path(local_dir).glob('*.avro')
    for path in paths:
        with open(path, 'rb') as fin:
            alert_list = [r for r in fastavro.reader(fin)]
        break
    alert_dict = alert_list[0]  # extract the single alert packet

    print(alert_dict.keys())

Load to a pandas DataFrame:

.. code:: python

    lightcurve_df = pittgoogle.utils.Cast.alert_dict_to_dataframe(alert_dict)

Plot light curves and cutouts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

See :ref:`ztf figures`.

Command line
------------

See also:

- `Quickstart: Using the gsutil tool `__
- `gsutil cp `__

Get help

.. code:: bash

    gsutil help
    gsutil help cp

Download a single file

.. code:: bash

    # fill in the path to the local directory to which you want to download files
    local_dir=
    # fill in the name of the file you want. See above for the syntax
    file_name=
    # file_name=ZTF17aaackje.1563161493315010012.ztf_20210413_programid1.avro
    # fill in the Pitt-Google project ID (the same value as pittgoogle_projectid in the Python section)
    pittgoogle_projectid=
    avro_bucket="${pittgoogle_projectid}-ztf-alert_avros"

    gsutil cp "gs://${avro_bucket}/${file_name}" "${local_dir}/."

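Download all files for an object. gsutil accepts wildcards, and the file names begin with the objectId (see the syntax above), so a prefix wildcard collects every alert for an object. This is a minimal sketch that reuses the variables set above; the objectId is an arbitrary example.

.. code:: bash

    # fill in the objectId whose alerts you want
    objectId=ZTF19acfixfe

    # list the matching files, then download them in parallel (-m)
    gsutil ls "gs://${avro_bucket}/${objectId}*"
    gsutil -m cp "gs://${avro_bucket}/${objectId}*" "${local_dir}/."
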
From 2f3072003b348506ba302054b0d834d2105fca02 Mon Sep 17 00:00:00 2001
From: Troy Raen
Date: Mon, 22 Jul 2024 14:58:40 -0700
Subject: [PATCH 2/2] re-add bigquery tutorial

---
 bigquery/README.rst | 402 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 402 insertions(+)
 create mode 100644 bigquery/README.rst

diff --git a/bigquery/README.rst b/bigquery/README.rst
new file mode 100644
index 0000000..ff21e73
--- /dev/null
+++ b/bigquery/README.rst
@@ -0,0 +1,402 @@

.. _bigquery:

BigQuery Tutorial
==================

.. contents:: Table of Contents
   :depth: 1
   :local:

This tutorial covers access via two methods: pittgoogle-client and the bq CLI.

Prerequisites
-------------

1. Complete the initial setup. In particular, be sure to:

   - :ref:`install` and/or :ref:`install the command-line tools`
   - set up a :ref:`service account`
   - :ref:`set your environment variables`

Python
------

Setup and basics
~~~~~~~~~~~~~~~~

Imports

.. code:: python

    import os

    import pandas as pd  # used below to read JSON-formatted results
    import pittgoogle

Create a Client for the BigQuery connections below

.. code:: python

    my_project_id = os.getenv('GOOGLE_CLOUD_PROJECT')
    pittgoogle.bigquery.create_client(my_project_id)

View the available tables and their schemas

.. code:: python

    # see which tables are available
    pittgoogle.bigquery.get_dataset_table_names()

    # look at the schema and basic info for a table
    table = 'DIASource'
    pittgoogle.bigquery.get_table_info(table)

Query lightcurves and other history
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Setup

.. code:: python

    # choose the history data you want returned
    columns = ['jd', 'fid', 'magpsf', 'sigmapsf']
    # 'objectId' and 'candid' will be included automatically
    # the options are the columns in the 'DIASource' table:
    # pittgoogle.bigquery.get_table_info('DIASource')

    # optional: choose specific objects
    objectIds = ['ZTF18aczuwfe', 'ZTF18aczvqcr', 'ZTF20acqgklx', 'ZTF18acexdlh']
    # optional: limit to a sample of the table
    # limit = 1000  # add this keyword to query_objects() below

To retrieve lightcurves and other history, we must query for objects'
"DIASource" observations and aggregate the results by ``objectId``.

``pittgoogle.bigquery.query_objects()`` is a convenience wrapper that lets you
grab all the results at once or step through them using a generator.
Its options are demonstrated below.

.. code:: python

    # Option 1: get a single DataFrame of all results

    lightcurves_df = pittgoogle.bigquery.query_objects(columns, objectIds=objectIds)
    # This executes a dry run and tells you how much data will be processed.
    # You will be asked to confirm before proceeding.
    # In the future, skip the confirmation by passing this keyword:
    dry_run = False

    lightcurves_df.sample(10)
    # the results have been cleaned of duplicates

Congratulations! You've now retrieved your first data from the transient
table. It is a DataFrame containing the candidate observations for every
object we requested, indexed by ``objectId`` and ``candid`` (candidate
ID). It includes the columns we requested in the query.

``fid`` is the filter, mapped to an integer. You can see the filter's
common name in the table schema we looked at earlier, or you can use
``pittgoogle.utils.ztf_fid_names()``, which returns a dictionary of the mapping.

.. code:: python

    # map the fid column to the filter's common name
    fid_names = pittgoogle.utils.ztf_fid_names()  # dict
    print(fid_names)

    lightcurves_df['filter'] = lightcurves_df['fid'].map(fid_names)
    lightcurves_df.head()

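Because the DataFrame is indexed by ``objectId`` and ``candid``, standard pandas indexing pulls out a single object's history. A minimal sketch, assuming the MultiIndex described above and using one of the objectIds we queried:

.. code:: python

    # select all candidate observations for one object (the remaining index is candid)
    single_lc = lightcurves_df.loc['ZTF20acqgklx']

    # sort by observation time and take a look
    single_lc = single_lc.sort_values('jd')
    print(single_lc[['jd', 'filter', 'magpsf', 'sigmapsf']].head())
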
Queries can return large datasets. You may want to use a generator to
step through objects individually and avoid loading the entire dataset
into memory at once. ``query_objects()`` can return one for you:

.. code:: python

    # Option 2: get a generator that yields a DataFrame for each objectId

    iterator = True
    objects = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, iterator=iterator, dry_run=dry_run
    )
    # the results are cleaned of duplicates

    for lightcurve_df in objects:
        print(f'\nobjectId: {lightcurve_df.objectId}')  # objectId is in the metadata
        print(lightcurve_df.sample(5))

Each DataFrame contains data on a single object and is indexed by
``candid``. The ``objectId`` is in the metadata.

``query_objects()`` can also return a JSON-formatted string of the query
results:

.. code:: python

    # Option 3: get a single JSON string with all the results

    format = 'json'
    lcsjson = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format=format, dry_run=dry_run
    )
    # the results are cleaned of duplicates
    print(lcsjson)

    # read it back in to a DataFrame
    df = pd.read_json(lcsjson)
    df.head()

.. code:: python

    # Option 4: get a generator that yields a JSON string for a single objectId

    format = 'json'
    iterator = True
    jobj = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format=format, iterator=iterator, dry_run=dry_run
    )
    # the results are cleaned of duplicates

    for lcjson in jobj:
        print(lcjson)
        # lightcurve_df = pd.read_json(lcjson)  # read back in to a DataFrame

Finally, ``query_objects()`` can return the raw query job object that it
gets from its API call to ``google.cloud.bigquery``'s ``query()``
method.

.. code:: python

    # Option 5: get the `query_job` object
    # (see the section on using google.cloud.bigquery directly)

    query_job = pittgoogle.bigquery.query_objects(
        columns, objectIds=objectIds, format="query_job", dry_run=dry_run
    )
    # query_job is iterable
    # each element contains the aggregated history for a single objectId
    # Beware: this has not been cleaned of duplicate entries

.. code:: python

    # Option 5 continued: parse the query_job results row by row

    for row in query_job:
        # values can be accessed by field name or index
        print(f"objectId={row[0]}, magpsf={row['magpsf']}")

        # pittgoogle can cast a row to a DataFrame or a JSON string
        # this option also cleans the duplicates
        lightcurve_df = pittgoogle.bigquery.format_history_query_results(row=row)
        print(f'\nobjectId: {lightcurve_df.objectId}')  # objectId is in the metadata
        print(lightcurve_df.head(1))
        lcjson = pittgoogle.bigquery.format_history_query_results(row=row, format='json')
        print('\n', lcjson)

        break

Plot a lightcurve
^^^^^^^^^^^^^^^^^

The following DataFrame can be used with the code in :ref:`ztf figures` to plot the object's light curves.

.. code:: python

    # get an object's lightcurve DataFrame with the minimum required columns
    columns = ['jd', 'fid', 'magpsf', 'sigmapsf', 'diffmaglim']
    objectId = 'ZTF20acqgklx'
    lightcurve_df = pittgoogle.bigquery.query_objects(columns, objectIds=[objectId], dry_run=False)

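For a quick look without the full :ref:`ztf figures` code, a few lines of matplotlib are enough. This is a minimal sketch assuming the usual ZTF column meanings: ``jd`` is the Julian date, ``magpsf``/``sigmapsf`` are the PSF magnitude and its error, and ``fid`` is the filter ID.

.. code:: python

    from matplotlib import pyplot as plt

    fig, ax = plt.subplots()
    for fid, band in lightcurve_df.groupby('fid'):
        band = band.sort_values('jd')
        ax.errorbar(band['jd'], band['magpsf'], yerr=band['sigmapsf'], fmt='o', label=f'fid={fid}')
    ax.invert_yaxis()  # smaller magnitude = brighter
    ax.set(xlabel='jd', ylabel='magpsf', title=objectId)
    ax.legend()
    plt.show()
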
Cone search
~~~~~~~~~~~

To perform a cone search, we query for object histories and then check
whether they are within the cone. ``pittgoogle.bigquery.cone_search()`` is a
convenience wrapper provided for demonstration, but note that it is very inefficient.

First we set the search parameters.

.. code:: python

    from astropy import coordinates as coord
    from astropy import units as u

    center = coord.SkyCoord(76.91, 6.02, frame='icrs', unit='deg')
    radius = coord.Angle(2, unit=u.deg)

    columns = ['jd', 'fid', 'magpsf', 'sigmapsf']
    # 'objectId' and 'candid' will be included automatically
    # the options are the columns in the 'DIASource' table:
    # pittgoogle.bigquery.get_table_info('DIASource')
    dry_run = False

    # we'll restrict to a handful of objects to reduce the runtime, but this is optional
    objectIds = ['ZTF18aczuwfe', 'ZTF18aczvqcr', 'ZTF20acqgklx', 'ZTF18acexdlh']

``cone_search()`` has options similar to ``query_objects()``.
Here we demonstrate one.

.. code:: python

    # Option 1: get a single DataFrame of all objects in the cone

    objects_in_cone = pittgoogle.bigquery.cone_search(
        center, radius, columns, objectIds=objectIds, dry_run=dry_run
    )
    objects_in_cone.sample(5)

--------------

Using google.cloud.bigquery
~~~~~~~~~~~~~~~~~~~~~~~~~~~

The previous sections demonstrated convenience wrappers for querying
with ``google.cloud.bigquery``. Here we demonstrate using these tools
directly with some basic examples. View the pittgoogle-client source code for
more examples.

Links to more information:

- `Query syntax in Standard SQL `__
- `google.cloud.bigquery docs `__

Query setup:

.. code:: python

    from google.cloud import bigquery

    # create a BigQuery Client to handle the connections
    bq_client = bigquery.Client(project=my_project_id)

.. code:: python

    # Write the standard SQL query statement

    # pittgoogle.bigquery.get_dataset_table_names()  # view the available tables
    # pittgoogle.bigquery.get_table_info('')  # view the available column names

    # construct the full table name
    pgb_project_id = 'ardent-cycling-243415'
    table = 'salt2'
    dataset = 'ztf_alerts'
    full_table_name = f'{pgb_project_id}.{dataset}.{table}'

    # construct the query
    query = (
        f'SELECT objectId, candid, t0, x0, x1, c, chisq, ndof '
        f'FROM `{full_table_name}` '
        f'WHERE ndof>0 and chisq/ndof<2 '
    )

    # note: if you want to query object histories, you can get the
    # query statement using `pittgoogle.bigquery.object_history_sql_statement()`

.. code:: python

    # Let's create a function that executes a "dry run"
    # and tells us how much data will be processed.
    # This is essentially `pittgoogle.bigquery.dry_run()`.
    def dry_run(query):
        job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
        query_job = bq_client.query(query, job_config=job_config)
        nbytes, TiB = query_job.total_bytes_processed, 2**40
        pTiB = nbytes / TiB * 100  # nbytes as a percent of 1 TiB
        print('\nQuery statement:')
        print(f'\n"{query}"\n')
        print(f'will process {nbytes} bytes of data.')
        print(f'({pTiB:.3}% of your 1 TiB Free Tier monthly allotment.)')

.. code:: python

    # find out how much data will be processed
    dry_run(query)

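If you want a hard cap rather than just a dry-run estimate, ``google.cloud.bigquery``'s ``QueryJobConfig`` accepts a ``maximum_bytes_billed`` limit; the job raises an error instead of running if it would exceed it. A minimal sketch; the 10 GiB cap is an arbitrary example value.

.. code:: python

    # refuse to run the query if it would bill more than ~10 GiB
    job_config = bigquery.QueryJobConfig(maximum_bytes_billed=10 * 2**30)
    capped_job = bq_client.query(query, job_config=job_config)
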
Query:

.. code:: python

    # make the API request
    query_job = bq_client.query(query)
    # Beware: the results may contain duplicate entries

Format and view results:

.. code:: python

    # Option 1: dump the results to a pandas.DataFrame
    df = query_job.to_dataframe()

    # some things you might want to do with it
    df = df.drop_duplicates()
    df = df.set_index(['objectId', 'candid']).sort_index()

    df.hist()
    df.head()

.. code:: python

    # Option 2: parse the results row by row
    for r, row in enumerate(query_job):

        # row values can be accessed by field name or index
        print(f"objectId={row[0]}, t0={row['t0']}")

        if r > 5:
            break

--------------

Command line
------------

Links to more information:

- `Quickstart using the bq command-line tool `__
- `Reference of all bq commands and flags `__
- `Query syntax in Standard SQL `__

.. code:: bash

    # get help
    bq help query

.. code:: bash

    # view the schema of a table
    bq show --schema --format=prettyjson ardent-cycling-243415:ztf_alerts.DIASource
    # bq show --schema --format=prettyjson ardent-cycling-243415:ztf_alerts.alerts

    # Note: the first time you make a call with `bq`, it will ask you to
    # initialize a .bigqueryrc configuration file. Follow the directions.

.. code:: bash

    # Query: dry run

    # first do a dry run by including the flag --dry_run
    bq query \
        --dry_run \
        --use_legacy_sql=false \
        'SELECT
            objectId, candid, t0, x0, x1, c, chisq, ndof
        FROM
            `ardent-cycling-243415.ztf_alerts.salt2`
        WHERE
            ndof>0 and chisq/ndof<2
        LIMIT
            10'

.. code:: bash

    # execute the query
    # note: single quotes keep bash from treating the backticks around the table name
    # as command substitution
    bq query \
        --use_legacy_sql=false \
        'SELECT
            objectId, candid, t0, x0, x1, c, chisq, ndof
        FROM
            `ardent-cycling-243415.ztf_alerts.salt2`
        WHERE
            ndof>0 and chisq/ndof<2
        LIMIT
            10'

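To spot-check a table's contents without composing a query at all, the ``bq head`` command prints the first rows of a table. A quick sketch; adjust the table name and row count to taste.

.. code:: bash

    # preview the first 5 rows of a table
    bq head -n 5 ardent-cycling-243415:ztf_alerts.salt2
    # bq head -n 5 ardent-cycling-243415:ztf_alerts.DIASource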