Skip to content

Commit

Permalink
[docs]: standardize doc loader doc strings (#25325)
Browse files Browse the repository at this point in the history
  • Loading branch information
isahers1 authored Aug 13, 2024
1 parent e0bbb81 commit f4ffd69
Show file tree
Hide file tree
Showing 8 changed files with 345 additions and 70 deletions.
48 changes: 34 additions & 14 deletions docs/docs/integrations/document_loaders/firecrawl.ipynb

Large diffs are not rendered by default.

34 changes: 27 additions & 7 deletions docs/docs/integrations/document_loaders/pypdfloader.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -131,21 +131,41 @@
"6"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"page = []\n",
"pages = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" pages.append(doc)\n",
" if len(pages) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []\n",
"len(page)"
" pages = []\n",
"len(pages)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LayoutParser : A Unified Toolkit for DL-Based DIA 11\n",
"focuses on precision, efficiency, and robustness. \n",
"{'source': './example_data/layout-parser-paper.pdf', 'page': 10}\n"
]
}
],
"source": [
"print(pages[0].page_content[:100])\n",
"print(pages[0].metadata)"
]
},
{
Expand Down
8 changes: 4 additions & 4 deletions docs/docs/integrations/document_loaders/recursive_url.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -229,14 +229,14 @@
}
],
"source": [
"page = []\n",
"pages = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" if len(page) >= 10:\n",
" pages.append(doc)\n",
" if len(pages) >= 10:\n",
" # do some paged operation, e.g.\n",
" # index.upsert(page)\n",
"\n",
" page = []"
" pages = []"
]
},
{
Expand Down
77 changes: 65 additions & 12 deletions docs/docs/integrations/document_loaders/web_base.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -250,27 +250,80 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"id": "303d2f4e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(metadata={'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}, page_content=\"\\n\\n\\n\\n\\n\\n\\n\\n\\nESPN - Serving Sports Fans. Anytime. Anywhere.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n Skip to main content\\n \\n\\n Skip to navigation\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n<\\n\\n>\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nMenuESPN\\n\\n\\n\\n\\n\\nscores\\n\\n\\n\\nNFLNBAMLBOlympicsSoccerWNBA…BoxingCFLNCAACricketF1GolfHorseLLWSMMANASCARNBA G LeagueNBA Summer LeagueNCAAFNCAAMNCAAWNHLNWSLPLLProfessional WrestlingRacingRN BBRN FBRugbySports BettingTennisX GamesUFLMore ESPNFantasyWatchESPN BETESPN+\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nSubscribe Now\\n\\n\\n\\n\\n\\nBoxing: Crawford vs. Madrimov (ESPN+ PPV)\\n\\n\\n\\n\\n\\n\\n\\nPFL Playoffs: Heavyweights & Women's Flyweights\\n\\n\\n\\n\\n\\n\\n\\nMLB\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball: Regionals\\n\\n\\n\\n\\n\\n\\n\\nIn The Arena: Serena Williams\\n\\n\\n\\n\\n\\n\\n\\nThe 30 College Football Playoff Contenders\\n\\n\\nQuick Links\\n\\n\\n\\n\\n2024 Paris Olympics\\n\\n\\n\\n\\n\\n\\n\\nOlympics: Everything to Know\\n\\n\\n\\n\\n\\n\\n\\nMLB Standings\\n\\n\\n\\n\\n\\n\\n\\nSign up: Fantasy Football\\n\\n\\n\\n\\n\\n\\n\\nWNBA Rookie Tracker\\n\\n\\n\\n\\n\\n\\n\\nNBA Free Agency Buzz\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball, Softball\\n\\n\\n\\n\\n\\n\\n\\nESPN Radio: Listen Live\\n\\n\\n\\n\\n\\n\\n\\nWatch Golf on ESPN\\n\\n\\n\\n\\n\\n\\nFavorites\\n\\n\\n\\n\\n\\n\\n Manage Favorites\\n \\n\\n\\n\\nCustomize ESPNCreate AccountLog InFantasy\\n\\n\\n\\n\\nFootball\\n\\n\\n\\n\\n\\n\\n\\nBaseball\\n\\n\\n\\n\\n\\n\\n\\nBasketball\\n\\n\\n\\n\\n\\n\\n\\nHockey\\n\\n\\nESPN Sites\\n\\n\\n\\n\\nESPN Deportes\\n\\n\\n\\n\\n\\n\\n\\nAndscape\\n\\n\\n\\n\\n\\n\\n\\nespnW\\n\\n\\n\\n\\n\\n\\n\\nESPNFC\\n\\n\\n\\n\\n\\n\\n\\nX Games\\n\\n\\n\\n\\n\\n\\n\\nSEC Network\\n\\n\\nESPN Apps\\n\\n\\n\\n\\nESPN\\n\\n\\n\\n\\n\\n\\n\\nESPN Fantasy\\n\\n\\n\\n\\n\\n\\n\\nTournament Challenge\\n\\n\\nFollow ESPN\\n\\n\\n\\n\\nFacebook\\n\\n\\n\\n\\n\\n\\n\\nX/Twitter\\n\\n\\n\\n\\n\\n\\n\\nInstagram\\n\\n\\n\\n\\n\\n\\n\\nSnapchat\\n\\n\\n\\n\\n\\n\\n\\nTikTok\\n\\n\\n\\n\\n\\n\\n\\nYouTube\\n\\n\\nCollege football's most entertaining conference? Why the 16-team Big 12 is wiiiiiide open this seasonLong known as one of the sport's most unpredictable conferences, the new-look Big 12 promises another dose of chaos.11hBill ConnellyScott Winters/Icon SportswireUSC, Oregon and the quest to bulk up for the Big TenTo improve on D, the Trojans wanted to bulk up for a new league. They're not the only team trying to do that.10hAdam RittenbergThe 30 teams that can reach the CFPConnelly's best games of the 2024 seasonTOP HEADLINESTeam USA sets world record in 4x400 mixed relayGermany beats France, undefeated in men's hoopsU.S. men's soccer exits Games after Morocco routHungary to protest Khelif's Olympic participationKobe's Staples Center locker sells for record $2.9MDjokovic, Alcaraz to meet again, this time for goldKerr: Team USA lineups based on players' rolesMarchand wins 200m IM; McEvoy takes 50 freeScouting Shedeur Sanders' NFL futureLATEST FROM PARISBreakout stars, best moments and what comes next: Breaking down the Games halfway throughAt the midpoint of the Olympics, we look back at some of the best moments and forward at what's still to come in the second week.35mESPNMustafa Yalcin/Anadolu via Getty ImagesThe numbers behind USA's world record in 4x400 mixed relay4h0:46Medal trackerFull resultsFull coverage of the OlympicsPRESEASON HAS BEGUN!New kickoff rules on display to start Hall of Fame Game19h0:41McAfee on NFL's new kickoff: It looks like a practice drill6h1:11NFL's new kickoff rules debut to mixed reviewsTexans-Bears attracts more bets than MLBTOP RANK BOXINGSATURDAY ON ESPN+ PPVWhy Terence Crawford is playing the long game for a chance to face CaneloCrawford is approaching Saturday's marquee matchup against Israil Madrimov with his sights set on landing a bigger one next: against Canelo Alvarez.10hMike CoppingerMark Robinson/Matchroom BoxingBradley's take: Crawford's power vs. Israil Madrimov's disciplined styleTimothy Bradley Jr. breaks down the junior middleweight title fight.2dTimothy Bradley Jr.Buy Crawford vs. Madrimov on ESPN+ PPVChance to impress for Madrimov -- and UzbekistanHOW FRIDAY WENTMORE FROM THE PARIS OLYMPICSGrant Fisher makes U.S. track history, Marchand wins 4th gold and more Friday at the Paris GamesSha'Carri Richardson made her long-awaited Olympic debut during the women's 100m preliminary round on Friday. Here's everything else you might have missed from Paris.31mESPNGetty ImagesU.S. men's loss to Morocco is a wake-up call before World CupOutclassed by Morocco, the U.S. men's Olympic team can take plenty of lessons with the World Cup on the horizon.4hSam BordenFull coverage of the OlympicsOLYMPIC MEN'S HOOPS SCOREBOARDFRIDAY'S GAMESOLYMPIC STANDOUTSWhy Simone Biles is now Stephen A.'s No. 1 Olympian7h0:58Alcaraz on the cusp of history after securing spot in gold medal match8h0:59Simone Biles' gymnastics titles: Olympics, Worlds, more statsOLYMPIC MEN'S SOCCER SCOREBOARDFRIDAY'S GAMESTRADE DEADLINE FALLOUTOlney: Eight big questions for traded MLB playersCan Jazz Chisholm Jr. handle New York? Is Jack Flaherty healthy? Will Jorge Soler's defense play? Key questions for players in new places.10hBuster OlneyWinslow Townson/Getty ImagesRanking the top prospects who changed teams at the MLB trade deadlineYou know the major leaguers who moved by now -- but what about the potential stars of tomorrow?1dKiley McDanielMLB Power RankingsSeries odds: Dodgers still on top; Phillies, Yanks behind them Top HeadlinesTeam USA sets world record in 4x400 mixed relayGermany beats France, undefeated in men's hoopsU.S. men's soccer exits Games after Morocco routHungary to protest Khelif's Olympic participationKobe's Staples Center locker sells for record $2.9MDjokovic, Alcaraz to meet again, this time for goldKerr: Team USA lineups based on players' rolesMarchand wins 200m IM; McEvoy takes 50 freeScouting Shedeur Sanders' NFL futureFavorites FantasyManage FavoritesFantasy HomeCustomize ESPNCreate AccountLog InICYMI0:47Nelson Palacio rips an incredible goal from outside the boxNelson Palacio scores an outside-of-the-box goal for Real Salt Lake in the 79th minute. \\n\\n\\nMedal Tracker\\n\\n\\n\\nCountries\\nAthletes\\n\\nOverall Medal Leaders43USA36FRA31CHNIndividual Medal LeadersGoldCHN 13FRA 11AUS 11SilverUSA 18FRA 12GBR 10BronzeUSA 16FRA 13CHN 9Overall Medal Leaders4MarchandMarchand3O'CallaghanO'Callaghan3McIntoshMcIntoshIndividual Medal LeadersGoldMarchand 4O'Callaghan 3McIntosh 2SilverSmith 3Huske 2Walsh 2BronzeYufei 3Bhaker 2Haughey 2\\n\\n\\nFull Medal Tracker\\n\\n\\nBest of ESPN+ESPNCollege Football Playoff 2024: 30 teams can reach postseasonHeather Dinich analyzes the teams with at least a 10% chance to make the CFP.AP Photo/Ross D. FranklinNFL Hall of Fame predictions: Who will make the next 10 classes?When will Richard Sherman and Marshawn Lynch make it? Who could join Aaron Donald in 2029? Let's map out each Hall of Fame class until 2034.Thearon W. Henderson/Getty ImagesMLB trade deadline 2024: Ranking prospects who changed teamsYou know the major leaguers who moved by now -- but what about the potential stars of tomorrow who went the other way in those deals? Trending NowIllustration by ESPNRanking the top 100 professional athletes since 2000Who tops our list of the top athletes since 2000? We're unveiling the top 25, including our voters' pick for the No. 1 spot.Photo by Kevin C. Cox/Getty Images2024 NFL offseason recap: Signings, coach moves, new rulesThink you missed something in the NFL offseason? We've got you covered with everything important that has happened since February.Stacy Revere/Getty ImagesTop 25 college football stadiums: Rose Bowl, Michigan and moreFourteen of ESPN's college football writers rank the 25 best stadiums in the sport. Who's No. 1, who missed the cut and what makes these stadiums so special?ESPNInside Nate Robinson's silent battle -- and his fight to liveFor nearly 20 years Nate Robinson has been fighting a silent battle -- one he didn't realize until recently could end his life. Sign up to play the #1 Fantasy game!Create A LeagueJoin Public LeagueReactivate A LeagueMock Draft NowSign up for FREE!Create A LeagueJoin a Public LeagueReactivate a LeaguePractice With a Mock DraftSign up for FREE!Create A LeagueJoin a Public LeagueReactivate a LeaguePractice with a Mock DraftGet a custom ESPN experienceEnjoy the benefits of a personalized accountSelect your favorite leagues, teams and players and get the latest scores, news and updates that matter most to you. \\n\\nESPN+\\n\\n\\n\\n\\nBoxing: Crawford vs. Madrimov (ESPN+ PPV)\\n\\n\\n\\n\\n\\n\\n\\nPFL Playoffs: Heavyweights & Women's Flyweights\\n\\n\\n\\n\\n\\n\\n\\nMLB\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball: Regionals\\n\\n\\n\\n\\n\\n\\n\\nIn The Arena: Serena Williams\\n\\n\\n\\n\\n\\n\\n\\nThe 30 College Football Playoff Contenders\\n\\n\\nQuick Links\\n\\n\\n\\n\\n2024 Paris Olympics\\n\\n\\n\\n\\n\\n\\n\\nOlympics: Everything to Know\\n\\n\\n\\n\\n\\n\\n\\nMLB Standings\\n\\n\\n\\n\\n\\n\\n\\nSign up: Fantasy Football\\n\\n\\n\\n\\n\\n\\n\\nWNBA Rookie Tracker\\n\\n\\n\\n\\n\\n\\n\\nNBA Free Agency Buzz\\n\\n\\n\\n\\n\\n\\n\\nLittle League Baseball, Softball\\n\\n\\n\\n\\n\\n\\n\\nESPN Radio: Listen Live\\n\\n\\n\\n\\n\\n\\n\\nWatch Golf on ESPN\\n\\n\\nFantasy\\n\\n\\n\\n\\nFootball\\n\\n\\n\\n\\n\\n\\n\\nBaseball\\n\\n\\n\\n\\n\\n\\n\\nBasketball\\n\\n\\n\\n\\n\\n\\n\\nHockey\\n\\n\\nESPN Sites\\n\\n\\n\\n\\nESPN Deportes\\n\\n\\n\\n\\n\\n\\n\\nAndscape\\n\\n\\n\\n\\n\\n\\n\\nespnW\\n\\n\\n\\n\\n\\n\\n\\nESPNFC\\n\\n\\n\\n\\n\\n\\n\\nX Games\\n\\n\\n\\n\\n\\n\\n\\nSEC Network\\n\\n\\nESPN Apps\\n\\n\\n\\n\\nESPN\\n\\n\\n\\n\\n\\n\\n\\nESPN Fantasy\\n\\n\\n\\n\\n\\n\\n\\nTournament Challenge\\n\\n\\nFollow ESPN\\n\\n\\n\\n\\nFacebook\\n\\n\\n\\n\\n\\n\\n\\nX/Twitter\\n\\n\\n\\n\\n\\n\\n\\nInstagram\\n\\n\\n\\n\\n\\n\\n\\nSnapchat\\n\\n\\n\\n\\n\\n\\n\\nTikTok\\n\\n\\n\\n\\n\\n\\n\\nYouTube\\n\\n\\nTerms of UsePrivacy PolicyYour US State Privacy RightsChildren's Online Privacy PolicyInterest-Based AdsAbout Nielsen MeasurementDo Not Sell or Share My Personal InformationContact UsDisney Ad Sales SiteWork for ESPNCorrectionsESPN BET is owned and operated by PENN Entertainment, Inc. and its subsidiaries ('PENN'). ESPN BET is available in states where PENN is licensed to offer sports wagering. Must be 21+ to wager. If you or someone you know has a gambling problem and wants help, call 1-800-GAMBLER.Copyright: © 2024 ESPN Enterprises, Inc. All rights reserved.\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"ESPN - Serving Sports Fans. Anytime. Anywhere.\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"{'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}\n"
]
}
],
"source": [
"page = []\n",
"pages = []\n",
"for doc in loader.lazy_load():\n",
" page.append(doc)\n",
" pages.append(doc)\n",
"\n",
"page[0]"
"print(pages[0].page_content[:100])\n",
"print(pages[0].metadata)"
]
},
{
Expand Down
60 changes: 56 additions & 4 deletions libs/community/langchain_community/document_loaders/firecrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,63 @@


class FireCrawlLoader(BaseLoader):
"""Load web pages as Documents using FireCrawl.
Must have Python package `firecrawl` installed and a FireCrawl API key. See
https://www.firecrawl.dev/ for more.
"""
FireCrawlLoader document loader integration
Setup:
Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.
.. code-block:: bash
pip install -U firecrawl-py langchain_community
export FIRECRAWL_API_KEY="your-api-key"
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import FireCrawlLoader
loader = FireCrawlLoader(
url = "https://firecrawl.dev",
mode = "crawl"
# other params = ...
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
Join the waitlist to turn any web
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
Join the waitlist to turn any web
{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
""" # noqa: E501

def __init__(
self,
Expand Down
62 changes: 59 additions & 3 deletions libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,66 @@ def load(self) -> List[Document]:


class PyPDFLoader(BasePDFLoader):
"""Load PDF using pypdf into list of documents.
Loader chunks by page and stores page numbers in metadata.
"""
PyPDFLoader document loader integration
Setup:
Install ``langchain-community``.
.. code-block:: bash
pip install -U langchain-community
Instantiate:
.. code-block:: python
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(
file_path = "./example_data/layout-parser-paper.pdf",
password = "my-pasword",
extract_images = True,
# headers = None
# extraction_mode = "plain",
# extraction_kwargs = None,
)
Lazy load:
.. code-block:: python
docs = []
docs_lazy = loader.lazy_load()
# async variant:
# docs_lazy = await loader.alazy_load()
for doc in docs_lazy:
docs.append(doc)
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
# TODO: Delete if async load is not implemented
Async load:
.. code-block:: python
docs = await loader.aload()
print(docs[0].page_content[:100])
print(docs[0].metadata)
.. code-block:: python
LayoutParser : A Unified Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1( ), R
{'source': './example_data/layout-parser-paper.pdf', 'page': 0}
""" # noqa: E501

def __init__(
self,
Expand Down
Loading

0 comments on commit f4ffd69

Please sign in to comment.