Commit
Merge pull request #83 from STRIDES/add-extramural-signup
added back tutorials dir to fix broken links in ms email flows
zbyosufzai authored Mar 22, 2024
2 parents 51ea2bc + 499b42b commit aca5975
Showing 38 changed files with 8,999 additions and 0 deletions.
91 changes: 91 additions & 0 deletions tutorials/README.md

Large diffs are not rendered by default.

583 changes: 583 additions & 0 deletions tutorials/notebooks/GWAS/GWAS_coat_color.ipynb

Large diffs are not rendered by default.

436 changes: 436 additions & 0 deletions tutorials/notebooks/GenAI/Azure_AI_Studio_README.md

Large diffs are not rendered by default.

395 changes: 395 additions & 0 deletions tutorials/notebooks/GenAI/Azure_Open_AI_README.md

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions tutorials/notebooks/GenAI/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) Microsoft Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
79 changes: 79 additions & 0 deletions tutorials/notebooks/GenAI/embedding_demos/acs_embeddings.py
@@ -0,0 +1,79 @@
from langchain.retrievers import AzureCognitiveSearchRetriever
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from PIL import Image
import os
import streamlit as st
from dotenv import load_dotenv

# load in .env variables
load_dotenv()

def config_keys():
# set api keys for AOAI and Azure Search
os.environ['OPENAI_API_VERSION'] = os.getenv('AZURE_OPENAI_VERSION')
os.environ['OPENAI_API_KEY'] = os.getenv('AZURE_OPENAI_KEY')
os.environ['OPENAI_API_BASE'] = os.getenv('AZURE_OPENAI_ENDPOINT')
os.environ['OPENAI_EMBEDDING_DEPLOYMENT_NAME'] = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME')
os.environ['AZURE_COGNITIVE_SEARCH_SERVICE_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME')
os.environ['AZURE_COGNITIVE_SEARCH_API_KEY'] = os.getenv('AZURE_COGNITIVE_SEARCH_API_KEY')
os.environ['AZURE_COGNITIVE_SEARCH_INDEX_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_INDEX_NAME')


def main():
# Streamlit config
st.title("Demo - Azure OpenAI & Cognitive Search Embeddings")
image = Image.open('image_logo2.png')
st.image(image, caption = '')
st.write('This program is designed to chat over your files in Azure Cognitive Search. \
Be specific and clear with the questions you ask. \
Welcome to CHATGPT over your own data !!')
if 'generated' not in st.session_state:
st.session_state.generated = []
if 'past' not in st.session_state:
st.session_state.past = []

# create your LLM and embeddings; we will be configuring 'azure' in the openai_api_type parameter.
llm = AzureChatOpenAI(
deployment_name = "gpt-35-turbo",
openai_api_type = "azure",
model = "gpt-35-turbo",
temperature=0.7,
max_tokens=200
)

embeddings = OpenAIEmbeddings(chunk_size=1, openai_api_type="azure")

# ask for the user query
query = st.text_input("Enter a search query: ", key='search_term', placeholder="")

if query:
st.session_state.past.append(query)

# set up Azure Cognitive Search to retrieve documents
# top_k = 1: we only want first related doc
retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=1)

# get the documents from Azure Cognitive Search that are relevant to the query being asked
docs = retriever.get_relevant_documents(query)

# create embedding from the document retrieved and place in a FAISS vector database
db = FAISS.from_documents(documents=docs, embedding=embeddings)

# set up the chain that will feed the retrieved document to the LLM
chain = RetrievalQA.from_chain_type(llm=llm, retriever = db.as_retriever(), chain_type="stuff")

# run the chain on the query asked
response = chain.run(query)
st.session_state.generated.append(response)

with st.expander('Vector Search'):
for i in range(len(st.session_state.generated)-1, -1, -1):
st.info(st.session_state.past[i])
st.success(st.session_state.generated[i])

if __name__ == '__main__':
config_keys()
main()
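Both embedding demos read their Azure configuration from a .env file via load_dotenv(). The sketch below is a small, hypothetical helper (not part of this commit) that checks whether the variables config_keys() expects are present; the variable names are taken from the script above, and the values come from your own .env file.

import os
from dotenv import load_dotenv

# Variables referenced by acs_embeddings.py; supply the values in your .env file.
REQUIRED_VARS = [
    "AZURE_OPENAI_VERSION",
    "AZURE_OPENAI_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME",
    "AZURE_COGNITIVE_SEARCH_SERVICE_NAME",
    "AZURE_COGNITIVE_SEARCH_API_KEY",
    "AZURE_COGNITIVE_SEARCH_INDEX_NAME",
]

load_dotenv()
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise SystemExit("Missing .env entries: " + ", ".join(missing))
print("All expected Azure OpenAI / Cognitive Search settings are present.")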
102 changes: 102 additions & 0 deletions tutorials/notebooks/GenAI/embedding_demos/aoai_embeddings.py
@@ -0,0 +1,102 @@
import openai
from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings]
import pandas as pd
import numpy as np
import os
import streamlit as st
import time
from PIL import Image
from dotenv import load_dotenv

# load in .env variables
load_dotenv()

# configure azure openai keys
openai.api_type = 'azure'
openai.api_version = os.environ['AZURE_OPENAI_VERSION']
openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT']
openai.api_key = os.environ['AZURE_OPENAI_KEY']

def embedding_create():
# acquire the filename of the file to be embedded
st.subheader("Vector Creation")
st.write('This program is designed to embed your pre-chunked .csv file. \
By accomplishing this task, you will be able to chat over all content in your .csv via vector searching. \
Just enter the file and the program will take care of the rest (specify file path if not in this directory). \
Welcome to CHATGPT over your own data !!')
filename = st.text_input("Enter a file: ", key='filename', value="")

# start the embeddings process if filename provided
if filename:

# read the data file to be embedded
df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\' + filename)
st.write(df)

# calculate word embeddings
df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine='text-embedding-ada-002'))
df.to_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv')
time.sleep(3)
st.subheader("Post Embedding")
st.success('Embeddings Created Successfully!!')
st.write(df)


def embeddings_search():

# Streamlit configuration
st.subheader("Vector Search")
st.write('This program is designed to chat over your vector stored (embedding) .csv file. \
This Chat Bot works alongside the "Embeddings Bot" Chat Bot. \
Be specific with the information you want to obtain over your data. \
Welcome to CHATGPT over your own data !!')
if 'answer' not in st.session_state:
st.session_state.answer = []
if 'score' not in st.session_state:
st.session_state.score = []
if 'past' not in st.session_state:
st.session_state.past = []

# read in the embeddings .csv
# convert elements in 'embedding' column back to numpy array
df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv')
df['embedding'] = df['embedding'].apply(eval).apply(np.array)

# calculate the user query embedding
search_term = st.text_input("Enter a search query: ", key='search_term', placeholder="")
if search_term:
st.session_state.past.append(search_term)
search_term_vector = get_embedding(search_term, engine='text-embedding-ada-002')

# find the similarity between the query and the document vectors
df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector))
df1 = df.sort_values("similarities", ascending=False).head(5)

# output the response
answer = df1['text'].loc[df1.index[0]]
score = df1['similarities'].loc[df1.index[0]]
st.session_state.answer.append(answer)
st.session_state.score.append(score)
with st.expander('Vector Search'):
for i in range(len(st.session_state.answer)-1, -1, -1):
st.info(st.session_state.past[i])
st.write(st.session_state.answer[i])
st.write('Score: ', st.session_state.score[i])


def main():
# Streamlit config
st.title("Demo-Azure OpenAI Embeddings")
image = Image.open('image_logo2.png')
st.image(image, caption = '')
st.sidebar.title('Chat Bot Type Selection')
chat_style = st.sidebar.selectbox(
'Choose between Embeddings Bot or Search Bot', ['Embeddings Bot','Search Bot']
)
if chat_style == 'Embeddings Bot':
embedding_create()
elif chat_style == 'Search Bot':
embeddings_search()

if __name__ == '__main__':
main()
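embedding_create() above assumes the input .csv already contains a 'text' column of pre-chunked passages. A minimal, hypothetical way to produce such a file (the filename and passages are placeholders, not the workshop data):

import pandas as pd

# Each list entry is one pre-chunked passage that will get its own embedding.
chunks = [
    "First pre-chunked passage of your document.",
    "Second pre-chunked passage of your document.",
]
pd.DataFrame({"text": chunks}).to_csv("my_chunks.csv", index=False)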
@@ -0,0 +1,99 @@
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_community.retrievers import AzureCognitiveSearchRetriever
from langchain_openai import AzureChatOpenAI
import sys
import json
import os


class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

MAX_HISTORY_LENGTH = 1

def build_chain():

os.getenv("AZURE_OPENAI_API_KEY")
os.getenv("AZURE_OPENAI_ENDPOINT")
os.getenv("AZURE_COGNITIVE_SEARCH_SERVICE_NAME")
os.getenv("AZURE_COGNITIVE_SEARCH_INDEX_NAME")
os.getenv("AZURE_COGNITIVE_SEARCH_API_KEY")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

llm = AzureChatOpenAI(
openai_api_version="2023-05-15",
azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
#max_tokens = 3000
)

retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=2)


prompt_template = """
Instructions:
I will provide you with a question and scientific documents; you will answer my question using information from the documents in English, and you will create a cumulative summary that is concise and accurate.
You should not include any personal opinions or interpretations in your summary, but rather focus on objectively presenting the information from the papers.
Your summary should be written in your own words; ensure that it is clear and concise.
{question} Answer "don't know" if not present in the documents.
{context}
Solution:"""


PROMPT = PromptTemplate(
template=prompt_template, input_variables=["context", "question"],
)

condense_qa_template = """
Chat History:
{chat_history}
Here is a new question for you: {question}
Standalone question:"""
standalone_question_prompt = PromptTemplate.from_template(condense_qa_template)

qa = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
condense_question_prompt=standalone_question_prompt,
return_source_documents=True,
combine_docs_chain_kwargs={"prompt":PROMPT}
)
return qa

def run_chain(chain, prompt: str, history=[]):
print(prompt)
return chain({"question": prompt, "chat_history": history})

if __name__ == "__main__":
chat_history = []
qa = build_chain()
print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
for query in sys.stdin:
if (query.strip().lower().startswith("new search:")):
query = query.strip().lower().replace("new search:","")
chat_history = []
elif (len(chat_history) == MAX_HISTORY_LENGTH):
chat_history.pop(0)
result = run_chain(qa, query, chat_history)
chat_history.append((query, result["answer"]))
print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC)
if 'source_documents' in result:
print(bcolors.OKGREEN + 'Sources:')
for d in result['source_documents']:
dict_meta=json.loads(d.metadata['metadata'])
print(dict_meta['source'])
print(bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
print(bcolors.OKBLUE + "Bye" + bcolors.ENDC)
@@ -0,0 +1,93 @@
from langchain.retrievers import PubMedRetriever
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI  # required for the AzureChatOpenAI LLM in build_chain (langchain_openai.AzureChatOpenAI in newer releases)
import sys
import json
import os


class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'

MAX_HISTORY_LENGTH = 1

def build_chain():
os.getenv("AZURE_OPENAI_API_KEY")
os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

llm = AzureChatOpenAI(
openai_api_version="2023-05-15",
azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
#max_tokens = 3000
)

retriever= PubMedRetriever()

prompt_template = """
Ignore everything before.
Instructions:
I will provide you with research papers on a specific topic in English, and you will create a cumulative summary.
The summary should be concise and should accurately and objectively communicate the takeaway of the papers related to the topic.
You should not include any personal opinions or interpretations in your summary, but rather focus on objectively presenting the information from the papers.
Your summary should be written in your own words; ensure that it is clear, concise, and accurately reflects the content of the original papers. First, provide a concise summary, then citations at the end.
{question} Answer "don't know" if not present in the document.
{context}
Solution:"""


PROMPT = PromptTemplate(
template=prompt_template, input_variables=["context", "question"],
)

condense_qa_template = """
Chat History:
{chat_history}
Here is a new question for you: {question}
Standalone question:"""
standalone_question_prompt = PromptTemplate.from_template(condense_qa_template)

qa = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
condense_question_prompt=standalone_question_prompt,
return_source_documents=True,
combine_docs_chain_kwargs={"prompt":PROMPT},
)
return qa

def run_chain(chain, prompt: str, history=[]):
print(prompt)
return chain({"question": prompt, "chat_history": history})

if __name__ == "__main__":
chat_history = []
qa = build_chain()
print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
for query in sys.stdin:
if (query.strip().lower().startswith("new search:")):
query = query.strip().lower().replace("new search:","")
chat_history = []
elif (len(chat_history) == MAX_HISTORY_LENGTH):
chat_history.pop(0)
result = run_chain(qa, query, chat_history)
chat_history.append((query, result["answer"]))
print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC)
if 'source_documents' in result:
print(bcolors.OKGREEN + 'Sources:')
for idx, ref in enumerate(result["source_documents"]):
print("PubMed UID: "+ref.metadata["uid"])
print(bcolors.ENDC)
print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC)
print(">", end=" ", flush=True)
print(bcolors.OKBLUE + "Bye" + bcolors.ENDC)