From 9cb2eeeb626bcc54a3fb5f981d9424d0e4c69148 Mon Sep 17 00:00:00 2001 From: Jukka Lampikoski Date: Wed, 11 Mar 2026 12:43:17 +0200 Subject: [PATCH] =?UTF-8?q?Korjaa=20IMAP=20s=C3=A4hk=C3=B6postien=20body-d?= =?UTF-8?q?ekoodaus=20kokonaan?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vanha fetchBody haki aina BODY[1] ja käytti haurasta regexiä koodauksen tunnistamiseen BODYSTRUCTURE:sta → monet viestit jäivät raakana base64/quoted-printable -muodossa. Uusi toteutus: - Parsii BODYSTRUCTURE:n nesting-syvyyden → oikea section-numero (TEXT/1/1.1 riippuen onko single-part/multipart/sisäkkäinen) - Joustava regex koodauksen tunnistamiseen (tukee NIL/"str" body-id/desc) - Automaattinen QP-tunnistus (=XX -koodien haku) base64:n lisäksi - extractPlainFromMultipart: jos BODY[1] palauttaa raakaa multipart-dataa boundary-rajoineen, parsii text/plain tai text/html suoraan MIME-osista - error_log debug-lokit BODYSTRUCTURE/section/encoding tiedoilla Co-Authored-By: Claude Opus 4.6 --- api.php | 222 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 181 insertions(+), 41 deletions(-) diff --git a/api.php b/api.php index 1dc9d8c..dc144b8 100644 --- a/api.php +++ b/api.php @@ -347,71 +347,211 @@ class ImapClient { } private function fetchBody(int $num): string { - // Try text/plain first via BODYSTRUCTURE + // Hae BODYSTRUCTURE rakenteen selvittämiseksi $resp = $this->command("FETCH {$num} BODYSTRUCTURE"); - $structLine = implode(' ', $resp); + $struct = implode(' ', $resp); + error_log("IMAP BODYSTRUCTURE msg#{$num}: " . substr($struct, 0, 500)); - // Simple approach: fetch BODY[1] (usually text/plain in multipart) - // or BODY[TEXT] for simple messages - $resp = $this->command("FETCH {$num} BODY.PEEK[1]"); - $body = $this->extractLiteral($resp); + // Etsi text/plain osa ja sen koodaus BODYSTRUCTURE:sta + // Joustava regex: param=(list|NIL), body-id=(NIL|"str"), body-desc=(NIL|"str"), encoding="str" + $pParam = '(?:\([^)]*\)|NIL)'; + $pNStr = '(?:NIL|"[^"]*")'; + $plainRx = '/"TEXT"\s+"PLAIN"\s+' . $pParam . '\s+' . $pNStr . '\s+' . $pNStr . '\s+"([^"]+)"/i'; + $htmlRx = '/"TEXT"\s+"HTML"\s+' . $pParam . '\s+' . $pNStr . '\s+' . $pNStr . '\s+"([^"]+)"/i'; - if (!$body) { - // Fallback: full text - $resp = $this->command("FETCH {$num} BODY.PEEK[TEXT]"); - $body = $this->extractLiteral($resp); + $plainEncoding = ''; + $htmlEncoding = ''; + if (preg_match($plainRx, $struct, $em)) $plainEncoding = strtoupper($em[1]); + if (preg_match($htmlRx, $struct, $em)) $htmlEncoding = strtoupper($em[1]); + + // Charset text/plain -osasta + $charset = 'utf-8'; + if (preg_match('/"TEXT"\s+"PLAIN"\s+\([^)]*"CHARSET"\s+"([^"]+)"/i', $struct, $cm)) { + $charset = strtolower($cm[1]); + } elseif (preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $struct, $cm)) { + $charset = strtolower(trim($cm[1], '"')); + } + + // Päättele oikea section-numero BODYSTRUCTURE:n rakenteesta + // Yksiosainen: BODYSTRUCTURE ("TEXT" "PLAIN" ...) → BODY[TEXT] + // Multipart: BODYSTRUCTURE (("TEXT" "PLAIN" ...) ...) → BODY[1] + // Sisäkkäinen: BODYSTRUCTURE ((("TEXT" "PLAIN" ...) ...) ...) → BODY[1.1] + $sections = []; + $plainPos = stripos($struct, '"TEXT" "PLAIN"'); + if ($plainPos !== false) { + $bsPos = stripos($struct, 'BODYSTRUCTURE'); + $after = ($bsPos !== false) ? substr($struct, $bsPos + 13) : $struct; + $plainInAfter = stripos($after, '"TEXT" "PLAIN"'); + if ($plainInAfter !== false) { + $beforePlain = substr($after, 0, $plainInAfter); + $depth = substr_count($beforePlain, '(') - substr_count($beforePlain, ')'); + if ($depth <= 1) { + $sections[] = 'TEXT'; // yksiosainen viesti + } elseif ($depth === 2) { + $sections[] = '1'; // suora lapsi multipartissa + } elseif ($depth >= 3) { + $sections[] = '1.1'; // sisäkkäinen multipart + } + } + } + // Lisää fallbackit + foreach (['1', '1.1', 'TEXT'] as $fb) { + if (!in_array($fb, $sections)) $sections[] = $fb; + } + + error_log("IMAP sections to try for msg#{$num}: " . implode(', ', $sections) . " | plainEnc={$plainEncoding} htmlEnc={$htmlEncoding} charset={$charset}"); + + // Kokeile osioita järjestyksessä + $body = ''; + $usedSection = ''; + foreach ($sections as $sec) { + $resp = $this->command("FETCH {$num} BODY.PEEK[{$sec}]"); + $data = $this->extractLiteral($resp); + if ($data && strlen(trim($data)) > 0) { + $body = $data; + $usedSection = $sec; + break; + } } if (!$body) return ''; - // Detect encoding from BODYSTRUCTURE - $encoding = ''; - // Parse BODYSTRUCTURE for encoding (7BIT, BASE64, QUOTED-PRINTABLE) - if (preg_match('/"TEXT"\s+"PLAIN"\s+\([^)]*\)\s+NIL\s+NIL\s+"([^"]+)"/i', $structLine, $em)) { - $encoding = strtoupper($em[1]); - } elseif (preg_match('/BODY\[1\].*?"([^"]+)"/i', $structLine, $em)) { - $encoding = strtoupper($em[1]); + // Päättele käytettävä koodaus + $encoding = $plainEncoding; + // Jos BODYSTRUCTURE ei löytänyt text/plain koodausta, kokeile raakaa hakua + if (!$encoding) { + if (preg_match('/"(BASE64|QUOTED-PRINTABLE|7BIT|8BIT)"/i', $struct, $em)) { + $encoding = strtoupper($em[1]); + } } - // Try to detect encoding from body content if not found - if (!$encoding) { - // Check if it looks like base64 - if (preg_match('/^[A-Za-z0-9+\/=\s]+$/', trim($body)) && strlen(trim($body)) > 50) { - $decoded = @base64_decode($body, true); - if ($decoded !== false && strlen($decoded) > 0) { - // Verify it produces readable text - if (preg_match('/[\x20-\x7E\xC0-\xFF]/', $decoded)) { - $body = $decoded; - } + error_log("IMAP body msg#{$num}: section={$usedSection} encoding={$encoding} bodyLen=" . strlen($body) . " first100=" . substr($body, 0, 100)); + + // Dekoodaa sisältö + if ($encoding === 'BASE64') { + $decoded = @base64_decode($body); + if ($decoded !== false) $body = $decoded; + } elseif ($encoding === 'QUOTED-PRINTABLE') { + $body = quoted_printable_decode($body); + } + + // Jos koodausta ei tunnistettu, yritä automaattinen tunnistus + if (!$encoding || $encoding === '7BIT' || $encoding === '8BIT') { + $trimmed = trim($body); + // Tarkista näyttääkö base64:ltä + if (preg_match('/^[A-Za-z0-9+\/=\s]+$/', $trimmed) && strlen($trimmed) > 50) { + $decoded = @base64_decode($trimmed, true); + if ($decoded !== false && strlen($decoded) > 0 && preg_match('/[\x20-\x7E\xC0-\xFF]/', $decoded)) { + $body = $decoded; } } - } else { - if ($encoding === 'BASE64') { - $body = base64_decode($body); - } elseif ($encoding === 'QUOTED-PRINTABLE') { - $body = quoted_printable_decode($body); + // Tarkista näyttääkö quoted-printable:lta (sisältää =XX koodeja) + elseif (preg_match('/=[0-9A-Fa-f]{2}/', $body) && substr_count($body, '=') > 3) { + $decoded = quoted_printable_decode($body); + if (strlen($decoded) < strlen($body)) { + $body = $decoded; + } } } - // Strip HTML if it looks like HTML + // Jos body sisältää multipart-rajoja (haettiin väärä osio), yritä parsia plain text + if (preg_match('/^--[^\r\n]+\r?\n/m', $body) && preg_match('/Content-Type:/i', $body)) { + error_log("IMAP msg#{$num}: body contains MIME boundaries, trying to extract text/plain"); + $extracted = $this->extractPlainFromMultipart($body); + if ($extracted) $body = $extracted; + } + + // Riisu HTML jos sisältö on HTML:ää if (preg_match('/