Skip to content

Commit

Permalink
Update ETL process to deal with directories
Browse files Browse the repository at this point in the history
  • Loading branch information
Alasdair Gray authored and Alasdair Gray committed Jun 22, 2022
1 parent ad6e6d6 commit 1e89146
Showing 1 changed file with 26 additions and 50 deletions.
76 changes: 26 additions & 50 deletions notebooks/ETLProcess.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,8 @@
"outputs": [],
"source": [
"def processDataFiles(idpKG, directoryLocation):\n",
" logging.debug(\"Processing directory: %s\" % directoryLocation)\n",
" logging.debug(\"Files: %s\" % os.listdir(directoryLocation))\n",
" processed = 0\n",
" for file in glob(directoryLocation + \"*.nq\"):\n",
" logging.info(\"\\tProcessing file: %s\" % file)\n",
Expand Down Expand Up @@ -716,50 +718,6 @@
" return processed"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Process Sample Data\n",
"The following function processes the sample data files (DisProt, MobiDB, and PED) provided in the GitHub repository."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def processSampleData(idpKG, sampleRoot=\"../scraped-data/sample/\"):\n",
"    \"\"\"Process the sample data files of every sample source.\n",
"\n",
"    Each source sub-directory below sampleRoot (disprot/, mobidb/, ped/) is\n",
"    handed to processDataFiles in turn; progress is printed and logged.\n",
"\n",
"    Args:\n",
"        idpKG: object passed through unchanged to processDataFiles\n",
"            (presumably the IDP knowledge graph -- confirm against caller).\n",
"        sampleRoot: directory containing the sample sub-directories. It must\n",
"            end with a path separator because paths are built by plain\n",
"            string concatenation.\n",
"\n",
"    Returns:\n",
"        The sum of the per-directory counts returned by processDataFiles.\n",
"    \"\"\"\n",
"    # (label shown in progress messages, sub-directory below sampleRoot)\n",
"    sampleSources = [\n",
"        (\"DisProt\", \"disprot/\"),\n",
"        (\"MobiDB\", \"mobidb/\"),\n",
"        (\"PED\", \"ped/\"),\n",
"    ]\n",
"\n",
"    totalProcessed = 0\n",
"    for label, subdirectory in sampleSources:\n",
"        # end='' keeps the file count on the same console line as the label\n",
"        print(\"Processing %s...\" % label, end='')\n",
"        logging.info(\"Processing %s...\" % label)\n",
"        numberOfFiles = processDataFiles(idpKG, sampleRoot + subdirectory)\n",
"        print(\"%d files processed\" % numberOfFiles)\n",
"        logging.info(\"%d files processed\" % numberOfFiles)\n",
"        totalProcessed += numberOfFiles\n",
"\n",
"    return totalProcessed"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -777,16 +735,34 @@
"source": [
"def controlProcessing(selected, output):\n",
" validate=False\n",
" totalProcessed = 0\n",
" print('Processing %r' % selected)\n",
" logging.info('Processing %r' % selected)\n",
" if selected=='../scraped-data/sample/':\n",
" # Process the sample data\n",
" totalProcessed = processSampleData(idpKG)\n",
" validate=True\n",
" else:\n",
" # Process Dump\n",
" totalProcessed = processDataFiles(idpKG, selected)\n",
"\n",
" for root, dirs, files in os.walk(selected):\n",
" # Walk over directory structure processing any *.nq files\n",
" for dir in dirs:\n",
" # Process sub-directories\n",
" numberOfFiles=0\n",
" print(\"Processing directory \" + selected+dir+\"/... \", end='')\n",
" logging.info(\"Processing directory \" + selected+dir+\"/... \")\n",
" numberOfFiles = processDataFiles(idpKG, selected+dir+\"/\")\n",
" print(\"%d files processed\" % numberOfFiles)\n",
" logging.info(\"\\t%d files processed\" % numberOfFiles)\n",
" totalProcessed += numberOfFiles\n",
" # Process files in root directory\n",
" numberOfFiles=0\n",
" print(\"Processing directory %s... \" % selected, end='')\n",
" logging.info(\"Processing directory %s... \" % selected)\n",
" numberOfFiles = processDataFiles(idpKG, selected)\n",
" print(\"%d files processed\" % numberOfFiles)\n",
" logging.info(\"\\t%d files processed\" % numberOfFiles)\n",
" totalProcessed += numberOfFiles\n",
" # Don't iterate over subdirectories\n",
" break\n",
" \n",
" # Output IDP KG\n",
" idpKG.serialize('IDPKG.nq', format='nquads')\n",
" idpKG.serialize('IDPKG.jsonld', format='json-ld')\n",
Expand Down Expand Up @@ -883,7 +859,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
"version": "3.8.13"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 1e89146

Please sign in to comment.