Source code for src.input

import pandas as pd
import json

def read_csv(file_path, chunksize=10000, na_values=None, keep_default_na=True):
    """
    Reads a CSV file and returns an iterator over pandas DataFrame chunks.

    Args:
        file_path (str): Path to the CSV file. Passed straight through to
            ``pd.read_csv``.
        chunksize (int): Number of rows per chunk.
        na_values (list): List of strings to be interpreted as NA/NaN.
            When omitted, ``["", " ", "NA", "N/A"]`` is used.
        keep_default_na (bool): Whether to include the default NaN values
            recognised by pandas in addition to ``na_values``.

    Returns:
        Iterator[pd.DataFrame]: DataFrame chunks, as produced by
        ``pd.read_csv`` when it is given an integer ``chunksize``.
    """
    if na_values is None:
        # Built per call rather than used as a default argument so the list
        # is never shared (and accidentally mutated) across invocations.
        na_values = ["", " ", "NA", "N/A"]

    return pd.read_csv(
        file_path,
        na_values=na_values,
        keep_default_na=keep_default_na,
        chunksize=chunksize,
    )
def read_tsv(file_path, chunksize=10000):
    """
    Streams a tab-separated file as successive pandas DataFrame chunks.

    Args:
        file_path (str): Path to the TSV file. Passed straight through to
            ``pd.read_csv``.
        chunksize (int): Number of rows per chunk.

    Returns:
        Iterator[pd.DataFrame]: DataFrame chunks, as produced by
        ``pd.read_csv`` when it is given an integer ``chunksize``.
    """
    # Extra strings treated as missing, on top of the pandas defaults.
    missing_markers = ["", " ", "NA", "N/A"]
    reader_options = {
        "sep": '\t',
        "na_values": missing_markers,
        "keep_default_na": True,
        "chunksize": chunksize,
    }
    return pd.read_csv(file_path, **reader_options)
def read_json(file_path, chunksize=10000):
    """
    Reads a JSON file and yields it as pandas DataFrame chunks.

    The whole file is read and parsed in memory, flattened into a table with
    ``pd.json_normalize`` and then sliced row-wise into chunks. Because this
    is a generator, the file is not opened and no error is raised until the
    result is first iterated.

    Args:
        file_path (str): Path to the JSON file (opened as UTF-8 text).
        chunksize (int | None): Maximum number of rows per yielded chunk.
            ``None`` yields all rows as a single chunk.

    Yields:
        pd.DataFrame: Consecutive row slices of the normalized table. A
        single empty DataFrame is yielded when the file is empty (or only
        whitespace) or when the normalized table is empty.

    Raises:
        ValueError: If ``chunksize`` is smaller than 1, if the file does not
            contain valid JSON, or if the top-level JSON value is neither a
            list nor a dict.
    """
    # A negative step would make the chunking range empty and silently drop
    # every row; a zero step fails with an unrelated range() message.
    if chunksize is not None and chunksize < 1:
        raise ValueError(
            f"chunksize must be a positive integer or None, got {chunksize!r}"
        )

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    if not content:
        # File is empty => yield an empty DataFrame
        yield pd.DataFrame()
        return

    try:
        data = json.loads(content)
    except json.JSONDecodeError as err:
        # Surface as ValueError for callers, keeping the decode error
        # (line/column details) attached as the cause.
        raise ValueError(
            f"Invalid JSON file or decode error at: {file_path}"
        ) from err

    # Scalars (numbers, strings, booleans, null) cannot be tabulated.
    if not isinstance(data, (list, dict)):
        raise ValueError(f"JSON content is not a list/dict: {file_path}")

    # Normalize JSON data into a flat table
    df = pd.json_normalize(data)
    if df.empty:
        yield pd.DataFrame()
        return

    if chunksize is None or chunksize >= len(df):
        # Everything fits in one chunk.
        yield df
        return

    for start in range(0, len(df), chunksize):
        yield df.iloc[start:start + chunksize]
def load_data(file_path, file_type, chunksize=10000):
    """
    Dispatches to the reader that matches ``file_type``.

    Args:
        file_path (str): Path to the data file.
        file_type (str): Type of the file ('csv', 'tsv', 'json'); matched
            case-insensitively.
        chunksize (int): Number of rows per chunk, forwarded to the selected
            reader.

    Returns:
        Iterator[pd.DataFrame]: Whatever the selected reader returns
        (``read_csv``, ``read_tsv`` or ``read_json``).

    Raises:
        ValueError: If the file type is unsupported.
    """
    # Normalize once so 'CSV', 'Csv' and 'csv' all select the same reader.
    normalized_type = file_type.lower()

    if normalized_type == 'csv':
        reader = read_csv
    elif normalized_type == 'tsv':
        reader = read_tsv
    elif normalized_type == 'json':
        reader = read_json
    else:
        # Report the value exactly as the caller supplied it.
        raise ValueError(f"Unsupported file type: {file_type}")

    return reader(file_path, chunksize=chunksize)