Commit
Merge pull request #83 from STRIDES/add-extramural-signup
added back tutorials dir to fix broken links in ms email flows
zbyosufzai authored Mar 22, 2024
2 parents 51ea2bc + 499b42b commit aca5975
Showing 38 changed files with 8,999 additions and 0 deletions.
91 changes: 91 additions & 0 deletions tutorials/README.md

Large diffs are not rendered by default.

583 changes: 583 additions & 0 deletions tutorials/notebooks/GWAS/GWAS_coat_color.ipynb

Large diffs are not rendered by default.

436 changes: 436 additions & 0 deletions tutorials/notebooks/GenAI/Azure_AI_Studio_README.md

Large diffs are not rendered by default.

395 changes: 395 additions & 0 deletions tutorials/notebooks/GenAI/Azure_Open_AI_README.md

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions tutorials/notebooks/GenAI/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Microsoft Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
79 changes: 79 additions & 0 deletions tutorials/notebooks/GenAI/embedding_demos/acs_embeddings.py
@@ -0,0 +1,79 @@
from langchain.retrievers import AzureCognitiveSearchRetriever
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from PIL import Image
import os
import streamlit as st
from dotenv import load_dotenv

# load in .env variables
load_dotenv()

def config_keys():
# set api keys for AOAI and Azure Search
os.environ['OPENAI_API_VERSION'] = os.getenv('AZURE_OPENAI_VERSION')
os.environ['OPENAI_API_KEY'] = os.getenv('AZURE_OPENAI_KEY')
os.environ['OPENAI_API_BASE'] = os.getenv('AZURE_OPENAI_ENDPOINT')
os.environ['OPENAI_EMBEDDING_DEPLOYMENT_NAME'] = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
os.environ['AZURE_COGNITIVE_SEARCH_SERVICE_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME')
os.environ['AZURE_COGNITIVE_SEARCH_API_KEY'] = os.getenv('AZURE_COGNITIVE_SEARCH_API_KEY')
os.environ['AZURE_COGNITIVE_SEARCH_INDEX_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_INDEX_NAME')


def main():
# Streamlit config
st.title("Demo - Azure OpenAI & Cognitive Search Embeddings")
image = Image.open('image_logo2.png')
st.image(image, caption = '')
st.write('This program is designed to chat over your files in Azure Cognitive Search. \
Be specific and clear with the questions you ask. \
Welcome to CHATGPT over your own data !!')
if 'generated' not in st.session_state:
st.session_state.generated = []
if 'past' not in st.session_state:
st.session_state.past = []

# create your LLM and embeddings; we will be configuring 'azure' in the openai_api_type parameter.
llm = AzureChatOpenAI(
deployment_name = "gpt-35-turbo",
openai_api_type = "azure",
model = "gpt-35-turbo",
temperature=0.7,
max_tokens=200
)

embeddings = OpenAIEmbeddings(chunk_size=1, openai_api_type="azure")

# ask for the user query
query = st.text_input("Enter a search query: ", key='search_term', placeholder="")

if query:
st.session_state.past.append(query)

# set up Azure Cognitive Search to retrieve documents
# top_k = 1: we only want first related doc
retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=1)

# get the documents from Azure Cognitive Search that are relevant to the query being asked
docs = retriever.get_relevant_documents(query)

# create embedding from the document retrieved and place in a FAISS vector database
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# set up the chain that will feed the retrieved document to the LLM
chain = RetrievalQA.from_chain_type(llm=llm, retriever = db.as_retriever(), chain_type="stuff")

# run the chain on the query asked
response = chain.run(query)
st.session_state.generated.append(response)

with st.expander('Vector Search'):
for i in range(len(st.session_state.generated)-1, -1, -1):
st.info(st.session_state.past[i])
st.success(st.session_state.generated[i])

if __name__ == '__main__':
config_keys()
main()
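Both embedding demos read their Azure configuration from a .env file via load_dotenv(). The sketch below is a small, hypothetical helper (not part of this commit) that checks whether the variables config_keys() expects are present; the variable names are taken from the script above, and the values come from your own .env file.

import os
from dotenv import load_dotenv

# Variables referenced by acs_embeddings.py; supply the values in your .env file.
REQUIRED_VARS = [
    "AZURE_OPENAI_VERSION",
    "AZURE_OPENAI_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME",
    "AZURE_COGNITIVE_SEARCH_SERVICE_NAME",
    "AZURE_COGNITIVE_SEARCH_API_KEY",
    "AZURE_COGNITIVE_SEARCH_INDEX_NAME",
]

load_dotenv()
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise SystemExit("Missing .env entries: " + ", ".join(missing))
print("All expected Azure OpenAI / Cognitive Search settings are present.")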
102 changes: 102 additions & 0 deletions tutorials/notebooks/GenAI/embedding_demos/aoai_embeddings.py
@@ -0,0 +1,102 @@
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings]
import pandas as pd
import numpy as np
import os
import streamlit as st
import time
from PIL import Image
from dotenv import load_dotenv

# load in .env variables
load_dotenv()

# configure azure openai keys
openai.api_type = 'azure'
openai.api_version = os.environ['AZURE_OPENAI_VERSION']
openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT']
openai.api_key = os.environ['AZURE_OPENAI_KEY']

def embedding_create():
# acquire the filename of the file to be embedded
st.subheader("Vector Creation")
st.write('This program is designed to embed your pre-chunked .csv file. \
By accomplishing this task, you will be able to chat over all content in your .csv via vector searching. \
Just enter the file and the program will take care of the rest (specify file path if not in this directory). \
Welcome to CHATGPT over your own data !!')
filename = st.text_input("Enter a file: ", key='filename', value="")

# start the embeddings process if filename provided
if filename:

# read the data file to be embedded
df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\' + filename)
st.write(df)

# calculate word embeddings
df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine='text-embedding-ada-002'))
df.to_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv')
time.sleep(3)
st.subheader("Post Embedding")
st.success('Embeddings Created Successfully!!')
st.write(df)


def embeddings_search():

# Streamlit configuration
st.subheader("Vector Search")
st.write('This program is designed to chat over your vector stored (embedding) .csv file. \
This Chat Bot works alongside the "Embeddings Bot" Chat Bot. \
Be specific with the information you want to obtain over your data. \
Welcome to CHATGPT over your own data !!')
if 'answer' not in st.session_state:
st.session_state.answer = []
if 'score' not in st.session_state:
st.session_state.score = []
if 'past' not in st.session_state:
st.session_state.past = []

# read in the embeddings .csv
# convert elements in 'embedding' column back to numpy array
df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv')
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# calculate the user query embedding
search_term = st.text_input("Enter a search query: ", key='search_term', placeholder="")
if search_term:
st.session_state.past.append(search_term)
search_term_vector = get_embedding(search_term, engine='text-embedding-ada-002')

# find the similarity between the query and the document vectors
df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector))
df1 = df.sort_values("similarities", ascending=False).head(5)

# output the response
answer = df1['text'].loc[df1.index[0]]
score = df1['similarities'].loc[df1.index[0]]
st.session_state.answer.append(answer)
st.session_state.score.append(score)
with st.expander('Vector Search'):
for i in range(len(st.session_state.answer)-1, -1, -1):
st.info(st.session_state.past[i])
st.write(st.session_state.answer[i])
st.write('Score: ', st.session_state.score[i])


def main():
# Streamlit config
st.title("Demo-Azure OpenAI Embeddings")
image = Image.open('image_logo2.png')
st.image(image, caption = '')
st.sidebar.title('Chat Bot Type Selection')
chat_style = st.sidebar.selectbox(
'Choose between Embeddings Bot or Search Bot', ['Embeddings Bot','Search Bot']
)
if chat_style == 'Embeddings Bot':
embedding_create()
elif chat_style == 'Search Bot':
embeddings_search()

if __name__ == '__main__':
main()
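embedding_create() above assumes the input .csv already contains a 'text' column of pre-chunked passages. A minimal, hypothetical way to produce such a file (the filename and passages are placeholders, not the workshop data):

import pandas as pd

# Each list entry is one pre-chunked passage that will get its own embedding.
chunks = [
    "First pre-chunked passage of your document.",
    "Second pre-chunked passage of your document.",
]
pd.DataFrame({"text": chunks}).to_csv("my_chunks.csv", index=False)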
@@ -0,0 +1,99 @@
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_community.retrievers import AzureCognitiveSearchRetriever
from langchain_openai import AzureChatOpenAI
import sys
import json
import os


class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

MAX_HISTORY_LENGTH = 1

def build_chain():

os.getenv("AZURE_OPENAI_API_KEY")
os.getenv("AZURE_OPENAI_ENDPOINT")
os.getenv("AZURE_COGNITIVE_SEARCH_SERVICE_NAME")
os.getenv("AZURE_COGNITIVE_SEARCH_INDEX_NAME")
os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

llm = AzureChatOpenAI(
openai_api_version="2023-05-15",
azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
#max_tokens = 3000
)

retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=2)


prompt_template = """
Instructions:
I will provide you with a question and scientific documents; you will answer my question using information from the documents in English, and you will create a cumulative summary that is concise and accurate.
You should not include any personal opinions or interpretations in your summary, but rather focus on objectively presenting the information from the papers.
Your summary should be written in your own words; ensure that it is clear and concise.
{question} Answer "don't know" if not present in the documents.
{context}
Solution:"""


PROMPT = PromptTemplate(
template=prompt_template, input_variables=["context", "question"],
)

condense_qa_template = """
Chat History:
{chat_history}
Here is a new question for you: {question}
Standalone question:"""
standalone_question_prompt = PromptTemplate.from_template(condense_qa_template)

qa = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
condense_question_prompt=standalone_question_prompt,
return_source_documents=True,
combine_docs_chain_kwargs={"prompt":PROMPT}
)
return qa

def run_chain(chain, prompt: str, history=[]):
print(prompt)
return chain({"question": prompt, "chat_history": history})

if __name__ == "__main__":
chat_history = []
qa = build_chain()
print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
for query in sys.stdin:
if (query.strip().lower().startswith("new search:")):
query = query.strip().lower().replace("new search:","")
chat_history = []
elif (len(chat_history) == MAX_HISTORY_LENGTH):
chat_history.pop(0)
result = run_chain(qa, query, chat_history)
chat_history.append((query, result["answer"]))
print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC)
if 'source_documents' in result:
print(bcolors.OKGREEN + 'Sources:')
for d in result['source_documents']:
dict_meta=json.loads(d.metadata['metadata'])
print(dict_meta['source'])
print(bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
print(bcolors.OKBLUE + "Bye" + bcolors.ENDC)
@@ -0,0 +1,93 @@
from langchain.retrievers import PubMedRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI  # required for the AzureChatOpenAI LLM in build_chain (langchain_openai.AzureChatOpenAI in newer releases)
import sys
import json
import os


class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

MAX_HISTORY_LENGTH = 1

def build_chain():
os.getenv("AZURE_OPENAI_API_KEY")
os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

llm = AzureChatOpenAI(
openai_api_version="2023-05-15",
azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
#max_tokens = 3000
)

retriever= PubMedRetriever()

prompt_template = """
Ignore everything before.
Instructions:
I will provide you with research papers on a specific topic in English, and you will create a cumulative summary.
The summary should be concise and should accurately and objectively communicate the takeaway of the papers related to the topic.
You should not include any personal opinions or interpretations in your summary, but rather focus on objectively presenting the information from the papers.
Your summary should be written in your own words; ensure that it is clear, concise, and accurately reflects the content of the original papers. First, provide a concise summary, then citations at the end.
{question} Answer "don't know" if not present in the document.
{context}
Solution:"""


PROMPT = PromptTemplate(
template=prompt_template, input_variables=["context", "question"],
)

condense_qa_template = """
Chat History:
{chat_history}
Here is a new question for you: {question}
Standalone question:"""
standalone_question_prompt = PromptTemplate.from_template(condense_qa_template)

qa = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
condense_question_prompt=standalone_question_prompt,
return_source_documents=True,
combine_docs_chain_kwargs={"prompt":PROMPT},
)
return qa

def run_chain(chain, prompt: str, history=[]):
print(prompt)
return chain({"question": prompt, "chat_history": history})

if __name__ == "__main__":
chat_history = []
qa = build_chain()
print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
for query in sys.stdin:
if (query.strip().lower().startswith("new search:")):
query = query.strip().lower().replace("new search:","")
chat_history = []
elif (len(chat_history) == MAX_HISTORY_LENGTH):
chat_history.pop(0)
result = run_chain(qa, query, chat_history)
chat_history.append((query, result["answer"]))
print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC)
if 'source_documents' in result:
print(bcolors.OKGREEN + 'Sources:')
for idx, ref in enumerate(result["source_documents"]):
print("PubMed UID: "+ref.metadata["uid"])
print(bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
print(bcolors.OKBLUE + "Bye" + bcolors.ENDC)