import pandas as pd
import numpy as np
import sys
import os

def select_and_save_random_rows(filenames, num_files, percentage, output_path):
    """Write several random row samples drawn from the combined input CSVs.

    All input files are loaded and concatenated into one table. Then
    ``num_files`` independent samples are drawn (without replacement inside a
    sample; different samples may share rows) and each is written to its own
    CSV file named ``NNN<base>.csv``, where ``NNN`` is a 1-based, zero-padded
    counter and ``<base>`` is the first five characters of the last component
    of ``output_path``.

    Args:
        filenames: Paths of the CSV files to combine. The first file is read
            with its header row; every later file is read with
            ``header=None`` and given the first file's column names.
        num_files: Number of sample files to write.
        percentage: Share of the combined rows to put in each sample,
            between 0 and 100 inclusive. The row count is rounded down.
        output_path: ``Directory/BaseName``. The directory part is created if
            it is missing; an empty directory part writes to the current
            directory.

    Raises:
        ValueError: If ``filenames`` is empty or ``percentage`` is outside
            the range 0-100.
    """
    filenames = list(filenames)
    if not filenames:
        raise ValueError("At least one input CSV file is required")
    if not 0 <= percentage <= 100:
        raise ValueError(
            f"percentage must be between 0 and 100, got {percentage}"
        )

    # The first file supplies the column names for every other file.
    first_frame = pd.read_csv(filenames[0])
    data_frames = [first_frame]
    for filename in filenames[1:]:
        # NOTE(review): header=None keeps the first line of these files as a
        # data row. That is right only if the later files carry no header
        # line -- confirm against the real inputs.
        data_frames.append(
            pd.read_csv(filename, header=None, names=first_frame.columns)
        )

    # Concatenate all dataframes into one
    combined_data = pd.concat(data_frames, ignore_index=True)

    # Multiply before dividing: (percentage / 100) * n can land just below a
    # whole number in floating point (29% of 100 -> 28.999...), and int()
    # would then drop a row.
    num_rows = len(combined_data)
    rows_to_select = int(percentage * num_rows / 100)

    # Split into directory and base name, creating the directory if needed
    output_dir, output_name = os.path.split(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Keep only five characters of the base name so that, with the three-digit
    # counter, the file stem stays within the character limit
    output_name = output_name[:5]

    for i in range(1, num_files + 1):
        # Each output file gets its own independent random sample
        selected_data = combined_data.sample(n=rows_to_select, replace=False)

        # e.g. "001ABCDE.csv"
        output_filename = f"{i:03d}{output_name}.csv"
        full_path = os.path.join(output_dir, output_filename)

        # Save the selected rows to a new CSV file
        selected_data.to_csv(full_path, index=False)
        print(f"Data saved to {full_path}")

if __name__ == "__main__":
    # Command line arguments, in order:
    #   CSV [CSV ...]  input CSV filenames (originally documented as 1 to 4;
    #                  no upper limit is enforced here)
    #   A              number of output files to write
    #   P              selection percentage
    #   OUTPUT         combined output path (Directory/BaseName)
    usage = (
        f"Usage: {sys.argv[0]} CSV [CSV ...] NUM_FILES PERCENTAGE "
        "DIRECTORY/BASENAME"
    )

    # Need at least one input file plus the three trailing arguments;
    # otherwise the negative indexes below would pick up the wrong values.
    if len(sys.argv) < 5:
        sys.exit(usage)

    input_files = sys.argv[1:-3]
    try:
        num_files = int(sys.argv[-3])
        percentage = int(sys.argv[-2])
    except ValueError:
        sys.exit(f"NUM_FILES and PERCENTAGE must be integers.\n{usage}")
    output_path = sys.argv[-1]

    select_and_save_random_rows(input_files, num_files, percentage, output_path)

