CSV has 8000 rows. The code needs to pick up "n" rows (example: 1500) from the bottom of the csv, along with the header column names.
The data from csv looks like this:
Ticker | Name | Date | Open | High | Low | Close | Volume |
---|---|---|---|---|---|---|---|
HINDOILEXP | HINDUSTAN OIL EXPLORATION | 14-12-94 | 31.30 | 31.30 | 31.30 | 31.30 | 920 |
HINDOILEXP | HINDUSTAN OIL EXPLORATION | 15-12-94 | 31.30 | 31.52 | 31.30 | 31.52 | 805 |
HINDOILEXP | HINDUSTAN OIL EXPLORATION | 16-12-94 | 32.61 | 34.78 | 32.61 | 34.78 | 460 |
The script below is using deque. And is giving this message on console:
R:\PyRAM>python myres3.py
Loading data from R:/PyRAM/db as mentioned in inputs.json
Missing columns in file R:/PyRAM/db\A459.csv: Ticker, Name, Date, Open, High, Low, Close, Volume
Below is the code that is unable to do the job (which is: loading last "n" lines with the 1st header column names row)
Code:
import pandas as pdimport jsonfrom datetime import datetimeimport globimport osimport concurrent.futuresimport sysimport timeimport msvcrtfrom concurrent.futures import ProcessPoolExecutorfrom concurrent.futures import Futurefrom concurrent.futures import ThreadPoolExecutorfrom multiprocessing import Processimport mathfrom tqdm import tqdm from collections import dequeimport io# Define a function to load CSV files and return a list of DataFramesdef load_csv_files(folder_path, required_columns, max_rows): csv_files = glob.glob(os.path.join(folder_path, '*.csv')) if not csv_files: print("No CSV files found in the folder.") print("Press ENTER to wait or ESC to exit...") while True: if msvcrt.kbhit() and msvcrt.getch() == b'\x1b': exit(0) if os.path.exists(os.path.join(folder_path, '*.csv')): break continue all_data = [] for file in csv_files: try: # Use deque to store the last 'max_rows' rows rows_to_lift = deque(maxlen=max_rows) with open(file, 'r') as csv_file: for line in csv_file: rows_to_lift.append(line) # Reconstruct the CSV data with the header and lifted rows lifted_data = list(rows_to_lift) # Create a DataFrame from the lifted data data = pd.read_csv(io.StringIO(''.join(lifted_data)), header=0) # Specify that the first row is the header # Check for missing columns missing_columns = [col for col in required_columns if col not in data.columns] if missing_columns: print(f"Missing columns in file {file}: {', '.join(missing_columns)}") continue data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%y') data.sort_values('Date', inplace=True) all_data.append(data) excel_file_path = os.path.join(folder_path, f"loaded_data_{len(all_data)}.xlsx") data.to_excel(excel_file_path, index=False) print(f"DataFrame {len(all_data)} saved to {excel_file_path}") except FileNotFoundError: print(f"Error: CSV file {file} not found.") continue return all_data