import pandas as pd
import sys
import os

def compare_csvs(csv_updated_path, csv_database_path, output_dir):
    """Diff two product CSVs and write added.csv / removed.csv / updated.csv.

    Products are identified by the composite key (cid, size). Rows present
    only in the updated file are "added", rows present only in the database
    file are "removed", and rows present in both whose price/stock columns
    differ are "updated" (written with the updated file's values).

    Args:
        csv_updated_path: Path to the new feed (semicolon-delimited).
        csv_database_path: Path to the current database export (comma-delimited).
        output_dir: Existing directory where the three result CSVs are written.
    """
    # NOTE: the two inputs use different delimiters by design —
    # the feed is ';'-separated, the database export is ','-separated.
    csv_updated = pd.read_csv(csv_updated_path, sep=';')
    csv_database = pd.read_csv(csv_database_path, sep=',')

    # Composite key identifying a unique product variant.
    key_columns = ['cid', 'size']

    # Drop rows with a missing/empty 'cid' — they cannot be matched.
    csv_updated = csv_updated[csv_updated['cid'].notna() & (csv_updated['cid'] != '')]
    csv_database = csv_database[csv_database['cid'].notna() & (csv_database['cid'] != '')]

    # Outer-merge so we keep rows unique to either side; '_merge' tells us which.
    merged = pd.merge(csv_updated, csv_database, on=key_columns, how='outer',
                      suffixes=('_updated', '_database'), indicator=True)

    def _take_side(frame, suffix, wanted_columns):
        # Keep only the columns belonging to one side of the merge and strip
        # the merge suffix so the output matches the source file's schema.
        keep = [c for c in frame.columns if c.replace(suffix, '') in wanted_columns]
        return frame[keep].rename(columns=lambda c: c.replace(suffix, ''))

    updated_side_cols = list(csv_updated.columns)
    database_side_cols = list(csv_database.columns)

    # Added: only in the updated feed; report with the feed's columns.
    added = _take_side(merged[merged['_merge'] == 'left_only'], '_updated', updated_side_cols)
    # Removed: only in the database; report with the database's columns.
    removed = _take_side(merged[merged['_merge'] == 'right_only'], '_database', database_side_cols)

    # Columns whose change marks a record as "updated".
    columns_to_check = ['stock', 'retail_price', 'retail_price_no_vat', 'sale_price', 'sale_price_no_vat']

    # Build per-column inequality masks. Compare on fillna(-1) *copies* so the
    # -1 placeholder (used to make NaN == NaN) never leaks into the output.
    change_masks = []
    for col in columns_to_check:
        upd_col, db_col = f'{col}_updated', f'{col}_database'
        if upd_col in merged.columns and db_col in merged.columns:
            change_masks.append(merged[upd_col].fillna(-1) != merged[db_col].fillna(-1))

    if change_masks:
        changed_rows = merged[(merged['_merge'] == 'both')
                              & pd.concat(change_masks, axis=1).any(axis=1)]
    else:
        # No comparable columns exist in both files -> nothing counts as updated.
        changed_rows = merged.iloc[0:0]

    # Updated: present in both with changed values; report the updated-side values.
    updated = _take_side(changed_rows, '_updated', updated_side_cols)

    # Define output file paths
    added_output_path = os.path.join(output_dir, 'added.csv')
    removed_output_path = os.path.join(output_dir, 'removed.csv')
    updated_output_path = os.path.join(output_dir, 'updated.csv')

    # Save the results to CSV
    added.to_csv(added_output_path, index=False)
    removed.to_csv(removed_output_path, index=False)
    updated.to_csv(updated_output_path, index=False)

    # Return a message indicating completion
    print("Comparison complete. Added, Removed, and Updated CSV files generated.")

if __name__ == '__main__':
    # CLI entry point: <updated_csv> <database_csv> <output_dir>.
    # Validate argument count up front instead of dying with an IndexError.
    if len(sys.argv) != 4:
        print("Usage: python compare_csvs.py <updated_csv> <database_csv> <output_dir>",
              file=sys.stderr)
        sys.exit(1)

    csv_updated_path = sys.argv[1]
    csv_database_path = sys.argv[2]
    output_dir = sys.argv[3]

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    compare_csvs(csv_updated_path, csv_database_path, output_dir)
