Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to read dbf file from zip-archive #39

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 67 additions & 44 deletions dbfread/dbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import os
import sys
import io
import datetime
import collections

Expand Down Expand Up @@ -93,48 +94,65 @@ def __init__(self, filename, encoding=None, ignorecase=True,
self.ignore_missing_memofile = ignore_missing_memofile
self.char_decode_errors = char_decode_errors

if recfactory is None:
self.recfactory = lambda items: items
else:
self.recfactory = recfactory

# Name part before .dbf is the table name
self.name = os.path.basename(filename)
self.name = os.path.splitext(self.name)[0].lower()
self._records = None
self._deleted = None

if ignorecase:
self.filename = ifind(filename)
if not self.filename:
raise DBFNotFound('could not find file {!r}'.format(filename))
else:
self.filename = filename

# Filled in by self._read_headers()
self.memofilename = None
self.header = None
self.fields = [] # namedtuples
self.field_names = [] # strings

with open(self.filename, mode='rb') as infile:
self._read_header(infile)
self._read_field_headers(infile)
self._check_headers()

try:
self.date = datetime.date(expand_year(self.header.year),
self.header.month,
self.header.day)
except ValueError:
# Invalid date or '\x00\x00\x00'.
self.date = None

self.memofilename = self._get_memofilename()

if load:
self.load()


try:
zfile = None
if filename.endswith(".zip"):
from zipfile import ZipFile
zfile = ZipFile(filename)
self.io = zfile
self.fname = zfile.namelist()[-1]
self.mode = "r"
else:
self.io = io
self.fname = filename
self.mode = "rb"

if recfactory is None:
self.recfactory = lambda items: items
else:
self.recfactory = recfactory

# Name part before .dbf is the table name
self.name = os.path.basename(filename)
self.name = os.path.splitext(self.name)[0].lower()
self._records = None
self._deleted = None

if ignorecase:
self.filename = ifind(filename)
if not self.filename:
raise DBFNotFound('could not find file {!r}'.format(filename))
else:
self.filename = filename

# Filled in by self._read_headers()
self.memofilename = None
self.header = None
self.fields = [] # namedtuples
self.field_names = [] # strings

with self.io.open(self.fname, mode = self.mode) as infile:
self._read_header(infile)
self._read_field_headers(infile)
self._check_headers()

try:
self.date = datetime.date(expand_year(self.header.year),
self.header.month,
self.header.day)
except ValueError:
# Invalid date or '\x00\x00\x00'.
self.date = None

self.memofilename = self._get_memofilename()

if load:
self.load()
finally:
if zfile is not None:
zfile.close()

@property
def dbversion(self):
    """Return get_dbversion_string() applied to the header's dbversion value."""
    version_code = self.header.dbversion
    return get_dbversion_string(version_code)
Expand Down Expand Up @@ -271,7 +289,7 @@ def _skip_record(self, infile):
def _count_records(self, record_type=b' '):
count = 0

with open(self.filename, 'rb') as infile:
with self.io.open(self.fname, mode = self.mode) as infile:
# Skip to first record.
infile.seek(self.header.headerlen, 0)

Expand All @@ -289,7 +307,7 @@ def _count_records(self, record_type=b' '):
return count

def _iter_records(self, record_type=b' '):
with open(self.filename, 'rb') as infile, \
with self.io.open(self.fname, mode = self.mode) as infile, \
self._open_memofile() as memofile:

# Skip to first record.
Expand Down Expand Up @@ -323,6 +341,11 @@ def _iter_records(self, record_type=b' '):
else:
skip_record(infile)

def DataFrame(self):
    """Return the table's records as a pandas DataFrame.

    The frame is built from ``self.records`` with
    ``pandas.DataFrame.from_records``.

    Raises ImportError if pandas is not installed.
    """
    # Kept as a function-local import (as in the original) so that merely
    # importing this module does not import pandas.
    import pandas as pd

    # from_records is a classmethod: call it on the class instead of
    # allocating an empty DataFrame just to reach it.
    return pd.DataFrame.from_records(self.records)

def __iter__(self):
if self.loaded:
return list.__iter__(self._records)
Expand Down