import pandas as pd
import json
[docs]def read_csv(file_path, chunksize=10000):
"""
Reads a CSV file and returns an iterator over pandas DataFrame chunks.
Args:
file_path (str): Path to the CSV file.
na_values (list): List of strings to be interpreted as NA/NaN.
keep_default_na (bool): Whether to include the default NaN values.
chunksize (int): Number of rows per chunk.
Returns:
Iterator[pd.DataFrame]: DataFrame chunks.
"""
return pd.read_csv(
file_path,
na_values=["", " ", "NA", "N/A"],
keep_default_na=True,
chunksize=chunksize
)
[docs]def read_tsv(file_path, chunksize=10000):
"""
Reads a TSV file and returns an iterator over pandas DataFrame chunks.
Args:
file_path (str): Path to the TSV file.
chunksize (int): Number of rows per chunk.
Returns:
Iterator[pd.DataFrame]: DataFrame chunks.
"""
return pd.read_csv(
file_path,
sep='\t',
na_values=["", " ", "NA", "N/A"],
keep_default_na=True,
chunksize=chunksize
)
[docs]def read_json(file_path, chunksize=10000):
"""
Reads a JSON file and returns an iterator over pandas DataFrame chunks.
Now gracefully handles empty files or decode errors.
"""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if not content:
# File is empty => yield an empty DataFrame
yield pd.DataFrame()
return
try:
data = json.loads(content)
except json.JSONDecodeError:
# If JSON is invalid => raise a ValueError so process_file can handle it
raise ValueError(f"Invalid JSON file or decode error at: {file_path}")
# If data is not a list or dict, we can handle that scenario as well
# but typically it should be list/dict. For safety:
if not isinstance(data, (list, dict)):
raise ValueError(f"JSON content is not a list/dict: {file_path}")
# Normalize JSON data into a flat table
df = pd.json_normalize(data)
if df.empty:
yield pd.DataFrame()
return
# Yield the DataFrame in chunks if necessary
if chunksize < len(df):
for start in range(0, len(df), chunksize):
yield df.iloc[start:start + chunksize]
else:
yield df
[docs]def load_data(file_path, file_type, chunksize=10000):
"""
Loads data from a file based on its type.
Args:
file_path (str): Path to the data file.
file_type (str): Type of the file ('csv', 'tsv', 'json').
chunksize (int): Number of rows per chunk (for CSV/TSV).
Returns:
Iterator[pd.DataFrame]: Data iterator for CSV/TSV/JSON.
Raises:
ValueError: If the file type is unsupported.
"""
if file_type.lower() == 'csv':
return read_csv(file_path, chunksize=chunksize)
elif file_type.lower() == 'tsv':
return read_tsv(file_path, chunksize=chunksize)
elif file_type.lower() == 'json':
return read_json(file_path, chunksize=chunksize)
else:
raise ValueError(f"Unsupported file type: {file_type}")