Source: includes/data-port/class-sensei-import-csv-reader.php

<?php
/**
 * File containing the Sensei_Import_CSV_Reader class.
 *
 * @package sensei
 */

// Bail out immediately when the ABSPATH constant is not defined.
defined( 'ABSPATH' ) || exit;

/**
 * This class is responsible for reading a CSV file.
 */
class Sensei_Import_CSV_Reader {
	/**
	 * Comma-separated list of candidate encodings, in the order they are handed to mb_detect_encoding()
	 * when a value is normalized to UTF-8.
	 */
	const MB_ENCODING_DETECTION_ORDER = 'UTF-8, ISO-8859-1, ISO-8859-15, EUC-JP, eucJP-win, JIS, ISO-2022-JP, ASCII';

	/**
	 * The file to be read.
	 *
	 * @var SplFileObject
	 */
	private $file;

	/**
	 * Number of data lines that are already read.
	 *
	 * @var int
	 */
	private $completed_lines;

	/**
	 * Number of the total lines of the file (the first line is not included).
	 *
	 * @var int
	 */
	private $total_lines;

	/**
	 * Whether reading is completed.
	 *
	 * @var bool
	 */
	private $is_completed;

	/**
	 * Number of lines to be read by each read_lines call.
	 *
	 * @var int
	 */
	private $lines_per_batch;

	/**
	 * Sensei_Import_CSV_Reader constructor.
	 *
	 * Opens the file, detects the delimiter and counts the data lines.
	 *
	 * @param string $csv_file         The CSV file name.
	 * @param int    $completed_lines  Number of data lines to skip. Negative values are treated as 0.
	 * @param int    $lines_per_batch  Number of lines to be read on each read_lines call. Values lower than 1 are
	 *                                 treated as 1.
	 *
	 * @throws RuntimeException When SplFileObject cannot open the file.
	 */
	public function __construct( $csv_file, $completed_lines = 0, $lines_per_batch = 30 ) {
		$this->file = new SplFileObject( $csv_file );
		// SplFileObject::READ_AHEAD is absolutely required for iterator_count to work, otherwise it blocks in an
		// internal infinite loop. See: https://bugs.php.net/bug.php?id=63616 - this bug is totally avoided by using
		// SplFileObject::READ_AHEAD.
		$this->file->setFlags( SplFileObject::READ_CSV | SplFileObject::READ_AHEAD );
		$this->detect_delimiter();

		// In PHP 8.0+ SplFileObject::key() doesn't work in the same way as the SplFileObject::key() in PHP 7.x,
		// so we need to use iterator_count instead to count the total number of lines in the file.
		// Also note that, as iterator_count calls SplFileObject::rewind() on $this->file, we need to subtract 1 to
		// ignore the header line and have the same result as SplFileObject::key() in PHP 7.0.
		// The result is clamped so that a file yielding no lines never reports a negative total.
		$this->total_lines = max( 0, iterator_count( $this->file ) - 1 );

		// After computing the total number of lines in the file, we reset the flags to remove the
		// SplFileObject::READ_AHEAD, which breaks our code.
		$this->file->setFlags( SplFileObject::READ_CSV );

		// A negative offset would make read_lines() seek onto the header line or to a negative position.
		$this->completed_lines = max( 0, (int) $completed_lines );
		$this->is_completed    = $this->completed_lines >= $this->total_lines;

		// At least one line has to be consumed per batch, otherwise read_lines() would return nothing without
		// ever advancing or completing.
		$this->lines_per_batch = max( 1, (int) $lines_per_batch );
	}

	/**
	 * Set the delimiter to read the CSV file.
	 *
	 * Unless a delimiter is forced through the `sensei_import_csv_delimiter` filter, every candidate delimiter is
	 * tried and the one producing the most columns (with the header matching the first data line) is selected.
	 * When no candidate matches, the first candidate is used.
	 */
	private function detect_delimiter() {
		/**
		 * Filters the default CSV delimiter.
		 *
		 * @since 3.2.0
		 * @since 3.3.0 Updated the default to `false`, so it'll get through the delimiter detection.
		 *
		 * @hook sensei_import_csv_delimiter
		 *
		 * @param {string} $delimiter The CSV file delimiter.
		 * @return {false|string} CSV file delimiter or false to skip.
		 */
		$forced_delimiter = apply_filters( 'sensei_import_csv_delimiter', false );

		if ( $forced_delimiter ) {
			$this->file->setCsvControl( $forced_delimiter );
			return;
		}

		$default_delimiters = [ ',', ';', "\t", '|' ];

		/**
		 * Filters the CSV delimiter options.
		 *
		 * @since 3.3.0
		 *
		 * @hook sensei_import_csv_delimiter_options
		 *
		 * @param {string[]} $delimiters The CSV file delimiter options.
		 * @return {array} CSV delimiter options.
		 */
		$delimiters = apply_filters( 'sensei_import_csv_delimiter_options', $default_delimiters );

		// The filtered value is not under our control: fall back to the defaults when it is unusable.
		if ( ! is_array( $delimiters ) || empty( $delimiters ) ) {
			$delimiters = $default_delimiters;
		}

		// Re-index so the fallback below always finds an entry at offset 0, even if a filter re-keyed the list.
		$delimiters         = array_values( $delimiters );
		$max_columns        = 0;
		$selected_delimiter = $delimiters[0];

		foreach ( $delimiters as $delimiter ) {
			$this->file->setCsvControl( $delimiter );

			$columns = $this->get_columns_number();

			if ( false !== $columns && $columns > $max_columns ) {
				$max_columns        = $columns;
				$selected_delimiter = $delimiter;
			}
		}

		$this->file->setCsvControl( $selected_delimiter );
	}

	/**
	 * Get the number of columns matching the header with the content.
	 *
	 * The header is compared against the first non-empty line that follows it.
	 *
	 * @return int|false Number of columns or
	 *                   `false` if the number of columns in the header and content doesn't match.
	 */
	private function get_columns_number() {
		$this->file->seek( 0 );
		$header             = $this->file->current();
		$first_line_columns = is_countable( $header ) ? count( $header ) : 0;

		// Skip the header.
		$this->file->next();

		// NOTE(review): eof() is tested before the line is inspected. While SplFileObject::READ_AHEAD is set (as it
		// is when the constructor calls this), the first data line may already have been read ahead; if it is also
		// the last line of the file and has no trailing newline, this loop is presumably skipped - confirm.
		while ( ! $this->file->eof() ) {
			$second_line         = $this->file->current();
			$second_line_columns = is_countable( $second_line ) ? count( $second_line ) : 0;

			// Empty lines say nothing about the delimiter, move on to the next one.
			if ( $this->is_empty_line( $second_line ) ) {
				$this->file->next();
				continue;
			}

			// Only the first non-empty line is compared with the header.
			return $first_line_columns === $second_line_columns ? $first_line_columns : false;
		}

		return false;
	}

	/**
	 * Whether a parsed line is the value SplFileObject produces for an empty line.
	 *
	 * SplFileObject->current() returns [ 0 => null ] on empty lines. The comparison is strict on purpose: a
	 * single-column line holding "0" is data, not an empty line.
	 *
	 * @param mixed $line The value returned by SplFileObject->current().
	 *
	 * @return bool
	 */
	private function is_empty_line( $line ) {
		if ( ! is_array( $line ) || 1 !== count( $line ) ) {
			return false;
		}

		$cell = isset( $line[0] ) ? $line[0] : null;

		return null === $cell || '' === $cell;
	}

	/**
	 * Read a batch of lines from the CSV file. It is expected that the file has been validated before this method is
	 * called.
	 *
	 * Every processed line adds exactly one entry to the result: an associative array for a valid line, an empty
	 * array for an empty line, or a WP_Error when the number of values doesn't match the number of columns.
	 *
	 * @return array {
	 *    An array of read lines.
	 *
	 *    @type array {
	 *      An array of the values of a line.
	 *
	 *      @type $$column_name Column value.
	 *    }
	 *    @type WP_Error An error for the specific line.
	 * }
	 */
	public function read_lines() {

		if ( $this->is_completed() ) {
			return [];
		}

		$columns       = $this->get_column_names();
		$columns_count = count( $columns );

		// Line 0 is the header, so data line N lives at file line N + 1.
		$this->file->seek( $this->completed_lines + 1 );
		$lines_processed = 0;
		$lines           = [];

		$convert_to_utf8 = in_array( get_option( 'blog_charset' ), [ 'utf8', 'utf-8', 'UTF8', 'UTF-8' ], true );

		while ( $lines_processed < $this->lines_per_batch ) {
			$lines_processed++;

			$current_line = $this->file->current();
			$indexed_line = is_array( $current_line ) ? $current_line : [];

			if ( empty( $indexed_line ) || $this->is_empty_line( $indexed_line ) ) {
				// Keep a placeholder so the position in the result still maps to the position in the file.
				$lines[] = [];
			} elseif ( count( $indexed_line ) !== $columns_count ) {
				$lines[] = new WP_Error(
					'sensei_data_port_job_wrong_number_of_columns',
					__( 'Line has incorrect number of columns.', 'sensei-lms' )
				);
			} else {
				if ( $convert_to_utf8 ) {
					$indexed_line = array_map( [ $this, 'convert_to_utf8' ], $indexed_line );
				}

				$lines[] = array_combine( $columns, $indexed_line );
			}

			// Checked after the line was consumed: the last line of the file still has to be returned.
			if ( $this->file->eof() ) {
				break;
			}

			$this->file->next();
		}

		if ( $this->file->eof() ) {
			$this->is_completed = true;
		}

		$this->completed_lines += $lines_processed;

		return $lines;
	}

	/**
	 * Normalize all strings to UTF-8.
	 *
	 * Uses the mbstring extension when available and falls back to wp_check_invalid_utf8() otherwise.
	 *
	 * @param string $value Value to be converted to UTF-8.
	 *
	 * @return string|null
	 */
	private function convert_to_utf8( $value ) {
		if ( null === $value || '' === $value ) {
			return $value;
		}

		if ( ! function_exists( 'mb_convert_encoding' ) ) {
			return wp_check_invalid_utf8( $value, true );
		}

		$encoding = mb_detect_encoding( $value, self::MB_ENCODING_DETECTION_ORDER, true );

		// When no encoding could be detected the value is converted from UTF-8 to UTF-8.
		return mb_convert_encoding( $value, 'UTF-8', $encoding ? $encoding : 'UTF-8' );
	}

	/**
	 * Get the column names of the file.
	 *
	 * The names are read from the first line, stripped of a byte order mark, trimmed and lowercased.
	 *
	 * @return string[]
	 */
	private function get_column_names() {
		$this->file->seek( 0 );

		$column_names = $this->file->current();

		if ( empty( $column_names ) || ! is_array( $column_names ) ) {
			return [];
		}

		// Remove BOM if it's present. The cell is null when the first line is empty, so it is cast to avoid
		// passing null to a string function (deprecated since PHP 8.1).
		$column_names[0] = str_replace( "\xEF\xBB\xBF", '', (string) $column_names[0] );

		// Make the column names of the CSV file case insensitive.
		return array_map(
			function ( $name ) {
				return strtolower( trim( (string) $name ) );
			},
			$column_names
		);
	}

	/**
	 * Whether the reading of the file is completed.
	 *
	 * @return bool
	 */
	public function is_completed() {
		return $this->is_completed;
	}

	/**
	 * The number of lines that have already been read.
	 *
	 * @return int
	 */
	public function get_completed_lines() {
		return $this->completed_lines;
	}

	/**
	 * The number of total lines in the file (the first line is not included).
	 *
	 * @return int
	 */
	public function get_total_lines() {
		return $this->total_lines;
	}

	/**
	 * Validate a CSV file.
	 *
	 * The file has to be readable, have a header containing every required column and no unknown column, and
	 * contain at least one non-empty line after the header.
	 *
	 * @param string $file_path        The file path.
	 * @param array  $required_columns The columns that the CSV file is required to have.
	 * @param array  $optional_columns The columns that are optional.
	 *
	 * @return bool|WP_Error `true` when the file is valid, a WP_Error describing the problem otherwise.
	 */
	public static function validate_csv_file( $file_path, $required_columns, $optional_columns ) {
		if ( ! is_readable( $file_path ) ) {
			return new WP_Error(
				'sensei_data_port_job_unreadable_file',
				__( 'Uploaded file could not be opened.', 'sensei-lms' )
			);
		}

		try {
			$reader = new self( $file_path );
		} catch ( Exception $e ) {
			return new WP_Error(
				'sensei_data_port_job_unreadable_file',
				$e->getMessage()
			);
		}

		$columns = $reader->get_column_names();
		if ( empty( $columns ) ) {
			return new WP_Error(
				'sensei_data_port_job_invalid_file',
				__( 'Uploaded file was not a valid CSV.', 'sensei-lms' )
			);
		}

		$has_required_columns     = array_intersect( $required_columns, $columns );
		$missing_required_columns = array_diff( $required_columns, $has_required_columns );

		if ( ! empty( $missing_required_columns ) ) {
			return new WP_Error(
				'sensei_data_port_job_missing_columns',
				sprintf(
					// translators: Placeholder is list of columns that are missing.
					_n(
						'Source file is missing the required column: %s',
						'Source file is missing the required columns: %s',
						count( $missing_required_columns ),
						'sensei-lms'
					),
					implode( ', ', $missing_required_columns )
				)
			);
		}

		$unknown_columns = array_diff( $columns, $required_columns, $optional_columns );

		if ( ! empty( $unknown_columns ) ) {
			return new WP_Error(
				'sensei_data_port_job_unknown_columns',
				sprintf(
					// translators: Placeholder is list of columns that are unknown.
					_n(
						'The following column is unknown: %s',
						'The following columns are unknown: %s',
						count( $unknown_columns ),
						'sensei-lms'
					),
					implode( ', ', $unknown_columns )
				)
			);
		}

		// Read batch after batch until a non-empty line shows up or the file is exhausted.
		while ( true ) {
			// Without a callback array_filter() drops the empty arrays that stand for empty lines and keeps
			// data lines as well as WP_Error objects.
			$non_empty_lines = array_filter( $reader->read_lines() );

			if ( ! empty( $non_empty_lines ) ) {
				break;
			}

			if ( $reader->is_completed() ) {
				return new WP_Error(
					'sensei_data_port_job_empty_file',
					__( 'Uploaded file is empty.', 'sensei-lms' )
				);
			}
		}

		return true;
	}
}