diff --git a/notebooks/GenAI/.gitignore b/notebooks/GenAI/.gitignore new file mode 100644 index 0000000..ad23308 --- /dev/null +++ b/notebooks/GenAI/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.venv +.env +microsoft-earnings_embeddings.csv \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/Demo_Suite.py b/notebooks/GenAI/embedding_demos/Demo_Suite.py new file mode 100644 index 0000000..327eb8d --- /dev/null +++ b/notebooks/GenAI/embedding_demos/Demo_Suite.py @@ -0,0 +1,61 @@ +import streamlit as st +from styling import global_page_style + +def main(): + # Set page configuration + # st.set_page_config(page_title="Azure OpenAI RAG Demo Suite", layout="wide") + + # Title and subtitle + # Create columns for logo and title + + st.markdown( + f'
', + unsafe_allow_html=True + ) + st.title("Azure OpenAI RAG Demo Suite") + st.markdown("### Demo Overviews") + st.write(""" + Welcome to the Azure OpenAI RAG Demo Suite. On the left side-panel, you will find various demonstrations that showcase the capabilities of Azure OpenAI with a Streamlit frontend. Each demonstration is described in detail below, highlighting their unique features and functionalities. + """) + + # Horizontal divider + st.markdown("---") + + # Chat with Your Data section + st.markdown("### Chat with Your Data using Azure OpenAI API and AI Search Index (AI Search Query)") + st.write(""" + This demo allows users to interact with data stored in their Azure AI Search Index using a combination of semantic and vector search methods. + """) + st.write(""" + - **Semantic Search**: Understands the meaning and context of your queries to deliver more relevant results. + - **Vector Search**: Utilizes numerical representations of text to find similar content based on cosine similarity. + """) + # Ensure the user has created the Azure AI search index already + st.write(""" + **Note**: Users must have created the Azure AI search index already as shown here: [Upload your own data and query over it](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md) + """) + + # Horizontal divider + st.markdown("---") + + # Generate & Search with Azure OpenAI Embeddings section + st.markdown("### Generate & Search with Azure OpenAI Embeddings (AOAI Embeddings)") + st.write(""" + This demo enables users to generate embeddings from a pre-chunked CSV file and perform searches over the content using vector search. + """) + st.write(""" + - **Vectorize**: Creates embeddings based on the "microsoft-earnings.csv" file provided in this directory. The embeddings are generated from the "text" column. The CSV file is pre-chunked, meaning the text has already been split and prepared for embedding generation. A new CSV file will be created to store all generated embeddings, forming your vector store. + - **Retrieve**: Generates embeddings based on user queries. The query embedding is then used to search for the most similar document within the vector store using cosine similarity. + """) + st.write(""" + Example questions a user can ask about the microsoft-earnings.csv: + - What was said about the budget? + - How many people utilize GitHub to build software? + - How many points did Microsoft Cloud gross margin percentage increase by? + - What are the expectations for the Q2 cash flow? + """) + + +if __name__ == '__main__': + global_page_style() + main() \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/acs_embeddings.py b/notebooks/GenAI/embedding_demos/acs_embeddings.py deleted file mode 100644 index 8a4a68a..0000000 --- a/notebooks/GenAI/embedding_demos/acs_embeddings.py +++ /dev/null @@ -1,79 +0,0 @@ -from langchain.retrievers import AzureCognitiveSearchRetriever -from langchain.embeddings import OpenAIEmbeddings -from langchain.vectorstores import FAISS -from langchain.chains import RetrievalQA -from langchain.chat_models import AzureChatOpenAI -from PIL import Image -import os -import streamlit as st -from dotenv import load_dotenv - -# load in .env variables -load_dotenv() - -def config_keys(): - # set api keys for AOAI and Azure Search - os.environ['OPENAI_API_VERSION'] = os.getenv('AZURE_OPENAI_VERSION') - os.environ['OPENAI_API_KEY'] = os.getenv('AZURE_OPENAI_KEY') - os.environ['OPENAI_API_BASE'] = os.getenv('AZURE_OPENAI_ENDPOINT') - os.environ['OPENAI_EMBEDDING_DEPLOYMENT_NAME'] = os.getenv('AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME') - os.environ['AZURE_COGNITIVE_SEARCH_SERVICE_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME') - os.environ['AZURE_COGNITIVE_SEARCH_API_KEY'] = os.getenv('AZURE_COGNITIVE_SEARCH_API_KEY') - os.environ['AZURE_COGNITIVE_SEARCH_INDEX_NAME'] = os.getenv('AZURE_COGNITIVE_SEARCH_INDEX_NAME') - - -def main(): - # Streamlit config - st.title("Demo - Azure OpenAI & Cognitive Search Embeddings") - image = Image.open('image_logo2.png') - st.image(image, caption = '') - st.write('This program is designed to chat over your files in Azure Cognitive Search. \ - Be specific and clear with the questions you ask. \ - Welcome to CHATGPT over your own data !!') - if 'generated' not in st.session_state: - st.session_state.generated = [] - if 'past' not in st.session_state: - st.session_state.past = [] - - # create your LLM and embeddings. Will be conifuring 'azure' in the openai_api_type parameter. - llm = AzureChatOpenAI( - deployment_name = "gpt-35-turbo", - openai_api_type = "azure", - model = "gpt-35-turbo", - temperature=0.7, - max_tokens=200 - ) - - embeddings = OpenAIEmbeddings(chunk_size=1, openai_api_type="azure") - - # ask for the user query - query = st.text_input("Enter a search query: ", key='search_term', placeholder="") - - if query: - st.session_state.past.append(query) - - # set up Azure Cognitive Search to retrieve documents - # top_k = 1: we only want first related doc - retriever = AzureCognitiveSearchRetriever(content_key="content", top_k=1) - - # get the relevant document from Azure Cognitive Search that are only relevant to the query being asked - docs = retriever.get_relevant_documents(query) - - # create embedding from the document retrieved and place in a FAISS vector database - db = FAISS.from_documents(documents=docs, embedding=embeddings) - - # set up the chain that will feed the retrieved document to the LLM - chain = RetrievalQA.from_chain_type(llm=llm, retriever = db.as_retriever(), chain_type="stuff") - - # run the chain on the query asked - response = chain.run(query) - st.session_state.generated.append(response) - - with st.expander('Vector Search'): - for i in range(len(st.session_state.generated)-1, -1, -1): - st.info(st.session_state.past[i]) - st.success(st.session_state.generated[i]) - -if __name__ == '__main__': - config_keys() - main() diff --git a/notebooks/GenAI/embedding_demos/aoai_embeddings.py b/notebooks/GenAI/embedding_demos/aoai_embeddings.py deleted file mode 100644 index eb694c7..0000000 --- a/notebooks/GenAI/embedding_demos/aoai_embeddings.py +++ /dev/null @@ -1,102 +0,0 @@ -import openai -from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings] -import pandas as pd -import numpy as np -import os -import streamlit as st -import time -from PIL import Image -from dotenv import load_dotenv - -# load in .env variables -load_dotenv() - -# configure azure openai keys -openai.api_type = 'azure' -openai.api_version = os.environ['AZURE_OPENAI_VERSION'] -openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT'] -openai.api_key = os.environ['AZURE_OPENAI_KEY'] - -def embedding_create(): - # acquire the filename to be embed - st.subheader("Vector Creation") - st.write('This program is designed to embed your pre-chunked .csv file. \ - By accomplishing this task, you will be able to chat over all cotent in your .csv via vector searching. \ - Just enter the file and the program will take care of the rest (specify file path if not in this directory). \ - Welcome to CHATGPT over your own data !!') - filename = st.text_input("Enter a file: ", key='filename', value="") - - # start the embeddings process if filename provided - if filename: - - # read the data file to be embed - df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\' + filename) - st.write(df) - - # calculate word embeddings - df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine='text-embedding-ada-002')) - df.to_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv') - time.sleep(3) - st.subheader("Post Embedding") - st.success('Embeddings Created Sucessfully!!') - st.write(df) - - -def embeddings_search(): - - # Streamlit configuration - st.subheader("Vector Search") - st.write('This program is designed to chat over your vector stored (embedding) .csv file. \ - This Chat Bot works alongside the "Embeddings Bot" Chat Bot. \ - Be specific with the information you want to obtain over your data. \ - Welcome to CHATGPT over your own data !!') - if 'answer' not in st.session_state: - st.session_state.answer = [] - if 'score' not in st.session_state: - st.session_state.score = [] - if 'past' not in st.session_state: - st.session_state.past = [] - - # read in the embeddings .csv - # convert elements in 'embedding' column back to numpy array - df = pd.read_csv('C:\\src\\AzureOpenAI_Gov_Workshop\\microsoft-earnings_embeddings.csv') - df['embedding'] = df['embedding'].apply(eval).apply(np.array) - - # caluculate user query embedding - search_term = st.text_input("Enter a search query: ", key='search_term', placeholder="") - if search_term: - st.session_state.past.append(search_term) - search_term_vector = get_embedding(search_term, engine='text-embedding-ada-002') - - # find similiarity between query and vectors - df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector)) - df1 = df.sort_values("similarities", ascending=False).head(5) - - # output the response - answer = df1['text'].loc[df1.index[0]] - score = df1['similarities'].loc[df1.index[0]] - st.session_state.answer.append(answer) - st.session_state.score.append(score) - with st.expander('Vector Search'): - for i in range(len(st.session_state.answer)-1, -1, -1): - st.info(st.session_state.past[i]) - st.write(st.session_state.answer[i]) - st.write('Score: ', st.session_state.score[i]) - - -def main(): - # Streamlit config - st.title("Demo-Azure OpenAI Embeddings") - image = Image.open('image_logo2.png') - st.image(image, caption = '') - st.sidebar.title('Chat Bot Type Selection') - chat_style = st.sidebar.selectbox( - 'Choose between Embeddings Bot or Search Bot', ['Embeddings Bot','Search Bot'] - ) - if chat_style == 'Embeddings Bot': - embedding_create() - elif chat_style == 'Search Bot': - embeddings_search() - -if __name__ == '__main__': - main() diff --git a/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py new file mode 100644 index 0000000..c1c18cd --- /dev/null +++ b/notebooks/GenAI/embedding_demos/pages/AI_Search_Query.py @@ -0,0 +1,106 @@ +from openai import AzureOpenAI +import os +import streamlit as st +from dotenv import load_dotenv +from styling import global_page_style + +# load in .env variables +load_dotenv() + +# Configure Azure OpenAI params, using an Azure OpenAI account with a deployment of an embedding model +azure_endpoint: str = os.getenv('AZURE_OPENAI_BASE') +azure_openai_api_key: str = os.getenv('AZURE_OPENAI_KEY') +azure_openai_api_version: str = os.getenv('AZURE_OPENAI_VERSION') +azure_ada_deployment: str = os.getenv('AZURE_EMBEDDINGS_DEPLOYMENT') +azure_gpt_deployment: str = os.getenv('AZURE_GPT_DEPLOYMENT') + +# Configure Azure AI Search params +search_endpoint: str = os.getenv('AZURE_SEARCH_ENDPOINT') +search_key: str = os.getenv('AZURE_SEARCH_ADMIN_KEY') + +def chat_on_your_data(query, search_index, messages): + messages.append({"role": "user", "content":query}) + with st.chat_message("user"): + st.markdown(query) + with st.spinner('Processing...'): + client = AzureOpenAI( + azure_endpoint=azure_endpoint, + api_key=azure_openai_api_key, + api_version=azure_openai_api_version, + ) + completion = client.chat.completions.create( + model=azure_gpt_deployment, + messages=[ + {"role": "system", "content": "You are an AI assistant that helps people find information. \ + Ensure the Markdown responses are correctly formatted before responding."}, + {"role": "user", "content": query} + ], + max_tokens=800, + temperature=0.7, + top_p=0.95, + frequency_penalty=0, + presence_penalty=0, + stop=None, + stream=False, + extra_body={ + "data_sources": [{ + "type": "azure_search", + "parameters": { + "endpoint": f"{search_endpoint}", + "index_name": search_index, + "semantic_configuration": "default", + "query_type": "vector_simple_hybrid", + "fields_mapping": {}, + "in_scope": True, + "role_information": "You are an AI assistant that helps people find information.", + "filter": None, + "strictness": 3, + "top_n_documents": 5, + "authentication": { + "type": "api_key", + "key": f"{search_key}" + }, + "embedding_dependency": { + "type": "deployment_name", + "deployment_name": azure_ada_deployment + } + } + }] + } + ) + print(completion) + response_data = completion.to_dict() + ai_response = response_data['choices'][0]['message']['content'] + messages.append({"role": "assistant", "content":ai_response}) + with st.chat_message("assistant"): + st.markdown(ai_response) + +def main(): + st.markdown( + f'', + unsafe_allow_html=True + ) + st.title("Demo - Azure OpenAI & AI Search") + # image = Image.open('image_logo2.png') + # st.image(image, caption = '') + st.write('This demo showcases an innovative way for users to engage with data housed in their Azure AI Search Index by leveraging both \ + semantic and vector search techniques. Semantic search enhances the querying process by comprehending the meaning and context of \ + user queries, thereby providing more pertinent results. Vector search, on the other hand, employs numerical representations of \ + text to identify similar content using cosine similarity. ***For users to effectively utilize this demo, it is essential that they \ + have previously created their Azure AI Search Index, following the necessary steps to upload and query their data as outlined [here](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/Azure_Open_AI_README.md).***') + if 'messages' not in st.session_state: + st.session_state.messages = [] + index_name = st.text_input(label="Azure AI Search index name:", value="") + st.write('-'*50) + if index_name: + query = st.chat_input('Input search query here...') + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message['content']) + if query: + chat_on_your_data(query, index_name, st.session_state.messages) + + +if __name__ == '__main__': + global_page_style() + main() \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py new file mode 100644 index 0000000..77702de --- /dev/null +++ b/notebooks/GenAI/embedding_demos/pages/AOAI_Embeddings.py @@ -0,0 +1,126 @@ +from openai import AzureOpenAI +# from openai.embeddings_utils import get_embedding, cosine_similarity # must pip install openai[embeddings] +import pandas as pd +import numpy as np +import os +import streamlit as st +import time +from PIL import Image +from dotenv import load_dotenv +from styling import global_page_style + +# load in .env variables +load_dotenv() + +# configure azure openai keys +# openai.api_type = 'azure' +# openai.api_version = os.environ['AZURE_OPENAI_VERSION'] +# openai.api_base = os.environ['AZURE_OPENAI_ENDPOINT'] +# openai.api_key = os.environ['AZURE_OPENAI_KEY'] + +def get_embedding(text, engine): + client = AzureOpenAI( + api_key=os.getenv("Azure_OPENAI_KEY"), + api_version=os.getenv('AZURE_OPENAI_VERSION'), + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT') + ) + + embeddings = client.embeddings.create(input = [text], model=engine).data[0].embedding + return embeddings + +def cosine_similarity(a, b): + return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) + +def embedding_create(): + # acquire the filename to be embed + st.subheader("Vector Creation") + st.write('The process of vectorization involves creating embeddings from the [microsoft-earnings.csv](https://github.com/STRIDES/NIHCloudLabAzure/blob/main/notebooks/GenAI/microsoft-earnings.csv) \ + file located in the specified directory, utilizing the data in the "text" column. These embeddings are derived from pre-chunked text, \ + indicating that the text has already been divided and formatted for embedding generation. The resultant embeddings will be \ + compiled into a new CSV file, which will serve as a vector store for future reference and utilization.') + filename = st.text_input("Enter a file: ", key='filename', value="microsoft-earnings.csv") + + # start the embeddings process if filename provided + if filename: + file_path = os.path.join('..', filename) + # read the data file to be embed + df = pd.read_csv(file_path) + df_placeholder = st.empty() + df_placeholder.dataframe(df, width=2000, height=350) + button_placeholder = st.empty() + if button_placeholder.button("Generate Embeddings"): + # calculate word embeddings + df['embedding'] = df['text'].apply(lambda x:get_embedding(x, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT"))) + df.to_csv('.\\microsoft-earnings_embeddings.csv') + time.sleep(3) + button_placeholder.success('Embeddings Created Sucessfully!!') + df_placeholder.dataframe(df) + + +def embeddings_search(): + + # Streamlit configuration + st.subheader("Vector Search") + st.write('This process generates embeddings based on user queries, utilizing the compiled CSV that was created, to search for the most similar\ + documents within the vector store by employing cosine similarity.') + if 'answer' not in st.session_state: + st.session_state.answer = [] + if 'score' not in st.session_state: + st.session_state.score = [] + if 'past' not in st.session_state: + st.session_state.past = [] + + # read in the embeddings .csv + # convert elements in 'embedding' column back to numpy array + df = pd.read_csv('.\\microsoft-earnings_embeddings.csv') + df['embedding'] = df['embedding'].apply(eval).apply(np.array) + + # caluculate user query embedding + search_term = st.text_area("Enter a search query: ", key='search_term', placeholder="") + if search_term: + st.session_state.past.append(search_term) + search_term_vector = get_embedding(search_term, engine=os.getenv("AZURE_EMBEDDINGS_DEPLOYMENT")) + + # find similiarity between query and vectors + df['similarities'] = df['embedding'].apply(lambda x:cosine_similarity(x, search_term_vector)) + df1 = df.sort_values("similarities", ascending=False).head(5) + + # output the response + answer = df1['text'].loc[df1.index[0]] + score = df1['similarities'].loc[df1.index[0]] + st.session_state.answer.append(answer) + st.session_state.score.append(score) + with st.expander('Vector Search'): + for i in range(len(st.session_state.answer)-1, -1, -1): + st.info(st.session_state.past[i]) + st.write(st.session_state.answer[i]) + st.write('Score: ', st.session_state.score[i]) + with st.expander('Top 5 Results'): + df1 = df1.reset_index(drop=True) + df1.index = df1.index + 1 + df1 = df1.rename(columns={'Unnamed: 0': 'Row Number'}) + print(df1) + st.dataframe(df1) + + +def main(): + st.markdown( + f'', + unsafe_allow_html=True + ) + st.title("Demo-Azure OpenAI Embeddings") + # image = Image.open('image_logo2.png') + # st.image(image, caption = '') + st.sidebar.title('Embedding Function Selection') + chat_style = st.sidebar.radio( + 'Choose an Embedding function below:', + ['Vectorize', 'Retrieve'] + ) + if chat_style == 'Vectorize': + embedding_create() + elif chat_style == 'Retrieve': + embeddings_search() + +if __name__ == '__main__': + global_page_style() + main() diff --git a/notebooks/GenAI/embedding_demos/style.css b/notebooks/GenAI/embedding_demos/style.css new file mode 100644 index 0000000..e4ebec9 --- /dev/null +++ b/notebooks/GenAI/embedding_demos/style.css @@ -0,0 +1,116 @@ +/* Global font family */ +body, .stApp, .stApp * { + font-family: sans-serif; +} + +.stApp > header { + background-color: #ffffff; + color: #f8f8f8; + padding: 2rem; +} + +.stApp { + background-color: #ffffff; + color: #000000; + padding: 2rem; +} + +/* Sidebar styling */ +[data-testid="stSidebar"] { + background-color: #2c5483; + color: #ffffff; +} + +[data-testid="stSidebarNav"] { + background-image: url(https://cloud.nih.gov/nih_logo.png); + background-repeat: no-repeat; + padding-top: 175px; + background-position: center 0px; + background-size: 200px; +} + +[data-testid="stSidebarNav"]::before { + content: ""; + margin-left: 60px; + margin-top: 20px; + font-size: 30px; + position: relative; + top: 100px; + font-weight: bold; + text-align: center; +} + +/* Ensure all text in sidebar is white */ +[data-testid="stSidebar"] * { + color: #ffffff !important; +} + +/* Success message styling */ +.success-message { + text-align: center; + margin: 20px 0; +} + +/* Main title styling */ +h1 { + color: #000400; + text-align: center; + margin-bottom: 20px; + font-size: 40px; +} + +/* Select box styling */ +.css-2trqyj { + font-size: 18px; +} + +/* Number input styling */ +.css-1siy2j7 { + font-size: 18px; +} + +/* Radio button styling */ +.css-1awt6go { + font-size: 18px; +} + +/* DataFrame styling */ +.css-1l269bu { + margin-top: 20px; +} + +/* Spinner styling */ +.css-1f6lu8k { + margin-top: 20px; + text-align: center; +} + +/* Pages Menu styling */ +.st-emotion-cache-1rtdyuf.eczjsme13 { + color: #ffffff; /* This sets the text color to white */ +} + +.st-emotion-cache-6tkfeg.eczjsme13 { + color: #ffffff !important; +} + +.css-10trblm, .css-1r7vkyz, .css-2trqyj { + color: black; +} + +/* Modify the streamlit menu in top right corner */ +.st-emotion-cache-w3nhqi.ef3psqc5 { + color: black !important; +} + +.st-emotion-cache-1wbqy5l.e17vllj40 { + color: black !important; +} + +.st-emotion-cache-jdyw56.en6cib60 { + color: black !important; +} + +.st-emotion-cache-cgqxho.ef3psqc6 { + color: black !important; +} \ No newline at end of file diff --git a/notebooks/GenAI/embedding_demos/styling.py b/notebooks/GenAI/embedding_demos/styling.py new file mode 100644 index 0000000..da99bcf --- /dev/null +++ b/notebooks/GenAI/embedding_demos/styling.py @@ -0,0 +1,7 @@ +import streamlit as st + +def global_page_style(): + st.set_page_config(layout="centered") + with open('style.css') as f: + css = f.read() + st.markdown(f'', unsafe_allow_html=True) \ No newline at end of file diff --git a/notebooks/GenAI/requirements.txt b/notebooks/GenAI/requirements.txt index ebf0d31..56ca87e 100644 --- a/notebooks/GenAI/requirements.txt +++ b/notebooks/GenAI/requirements.txt @@ -1,12 +1,7 @@ python-dotenv openai -openai[embeddings] pandas numpy streamlit -langchain -langchain-openai -langchain-community -azure-search-documents==11.4.0b6 -tiktoken -faiss-cpu +azure-search-documents +azure-identity