Requests_IRI::remove_iunreserved_percent_encoded( array $match )

Callback function for preg_replace_callback.


Description Description

Removes sequences of percent encoded bytes that represent UTF-8 encoded characters in iunreserved


Parameters Parameters

$match

(array) (Required) PCRE match


Top ↑

Return Return

(string) Replacement


Top ↑

Source Source

File: wp-includes/Requests/IRI.php

	protected function remove_iunreserved_percent_encoded($match) {
		// As we just have valid percent encoded sequences we can just explode
		// and ignore the first member of the returned array (an empty string).
		$bytes = explode('%', $match[0]);

		// Initialize the new string (this is what will be returned) and that
		// there are no bytes remaining in the current sequence (unsurprising
		// at the first byte!).
		$string = '';
		$remaining = 0;

		// Loop over each and every byte, and set $value to its value
		for ($i = 1, $len = count($bytes); $i < $len; $i++) {
			$value = hexdec($bytes[$i]);

			// If we're the first byte of sequence:
			if (!$remaining) {
				// Start position
				$start = $i;

				// By default we are valid
				$valid = true;

				// One byte sequence:
				if ($value <= 0x7F) {
					$character = $value;
					$length = 1;
				}
				// Two byte sequence:
				elseif (($value & 0xE0) === 0xC0) {
					$character = ($value & 0x1F) << 6;
					$length = 2;
					$remaining = 1;
				}
				// Three byte sequence:
				elseif (($value & 0xF0) === 0xE0) {
					$character = ($value & 0x0F) << 12;
					$length = 3;
					$remaining = 2;
				}
				// Four byte sequence:
				elseif (($value & 0xF8) === 0xF0) {
					$character = ($value & 0x07) << 18;
					$length = 4;
					$remaining = 3;
				}
				// Invalid byte:
				else {
					$valid = false;
					$remaining = 0;
				}
			}
			// Continuation byte:
			else {
				// Check that the byte is valid, then add it to the character:
				if (($value & 0xC0) === 0x80) {
					$remaining--;
					$character |= ($value & 0x3F) << ($remaining * 6);
				}
				// If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
				else {
					$valid = false;
					$remaining = 0;
					$i--;
				}
			}

			// If we've reached the end of the current byte sequence, append it to Unicode::$data
			if (!$remaining) {
				// Percent encode anything invalid or not in iunreserved
				if (
					// Invalid sequences
					!$valid
					// Non-shortest form sequences are invalid
					|| $length > 1 && $character <= 0x7F
					|| $length > 2 && $character <= 0x7FF
					|| $length > 3 && $character <= 0xFFFF
					// Outside of range of iunreserved codepoints
					|| $character < 0x2D
					|| $character > 0xEFFFD
					// Noncharacters
					|| ($character & 0xFFFE) === 0xFFFE
					|| $character >= 0xFDD0 && $character <= 0xFDEF
					// Everything else not in iunreserved (this is all BMP)
					|| $character === 0x2F
					|| $character > 0x39 && $character < 0x41
					|| $character > 0x5A && $character < 0x61
					|| $character > 0x7A && $character < 0x7E
					|| $character > 0x7E && $character < 0xA0
					|| $character > 0xD7FF && $character < 0xF900
				) {
					for ($j = $start; $j <= $i; $j++) {
						$string .= '%' . strtoupper($bytes[$j]);
					}
				}
				else {
					for ($j = $start; $j <= $i; $j++) {
						$string .= chr(hexdec($bytes[$j]));
					}
				}
			}
		}

		// If we have any bytes left over they are invalid (i.e., we are
		// mid-way through a multi-byte sequence)
		if ($remaining) {
			for ($j = $start; $j < $len; $j++) {
				$string .= '%' . strtoupper($bytes[$j]);
			}
		}

		return $string;
	}

Top ↑

User Contributed Notes User Contributed Notes

You must log in before being able to contribute a note or feedback.