Korjaa IMAP sähköpostien body-dekoodaus kokonaan

Vanha fetchBody haki aina BODY[1] ja käytti haurasta regexiä koodauksen tunnistamiseen BODYSTRUCTURE:sta → monet viestit jäivät raakana base64/quoted-printable -muodossa.

Uusi toteutus:
- Parsii BODYSTRUCTURE:n nesting-syvyyden → oikea section-numero (TEXT/1/1.1 riippuen onko single-part/multipart/sisäkkäinen)
- Joustava regex koodauksen tunnistamiseen (tukee NIL/"str" body-id/desc)
- Automaattinen QP-tunnistus (=XX -koodien haku) base64:n lisäksi
- extractPlainFromMultipart: jos BODY[1] palauttaa raakaa multipart-dataa boundary-rajoineen, parsii text/plain tai text/html suoraan MIME-osista
- error_log debug-lokit BODYSTRUCTURE/section/encoding tiedoilla

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 12:43:17 +02:00
parent a898da119e
commit 9cb2eeeb62
1 changed files with 181 additions and 41 deletions
--- a/api.php
+++ b/api.php
@@ -347,71 +347,211 @@ class ImapClient {
    }

    private function fetchBody(int $num): string {
-        // Try text/plain first via BODYSTRUCTURE
+        // Fetch BODYSTRUCTURE to work out the message layout
        $resp = $this->command("FETCH {$num} BODYSTRUCTURE");
-        $structLine = implode(' ', $resp);
+        $struct = implode(' ', $resp);
+        error_log("IMAP BODYSTRUCTURE msg#{$num}: " . substr($struct, 0, 500));

-        // Simple approach: fetch BODY[1] (usually text/plain in multipart)
-        // or BODY[TEXT] for simple messages
-        $resp = $this->command("FETCH {$num} BODY.PEEK[1]");
-        $body = $this->extractLiteral($resp);
+        // Find the text/plain part and its transfer encoding in BODYSTRUCTURE
+        // Flexible regex: param=(list|NIL), body-id=(NIL|"str"), body-desc=(NIL|"str"), encoding="str"
+        $pParam = '(?:\([^)]*\)|NIL)';
+        $pNStr  = '(?:NIL|"[^"]*")';
+        $plainRx = '/"TEXT"\s+"PLAIN"\s+' . $pParam . '\s+' . $pNStr . '\s+' . $pNStr . '\s+"([^"]+)"/i';
+        $htmlRx  = '/"TEXT"\s+"HTML"\s+'  . $pParam . '\s+' . $pNStr . '\s+' . $pNStr . '\s+"([^"]+)"/i';

-        if (!$body) {
-            // Fallback: full text
-            $resp = $this->command("FETCH {$num} BODY.PEEK[TEXT]");
-            $body = $this->extractLiteral($resp);
+        $plainEncoding = '';
+        $htmlEncoding = '';
+        if (preg_match($plainRx, $struct, $em)) $plainEncoding = strtoupper($em[1]);
+        if (preg_match($htmlRx, $struct, $em))  $htmlEncoding  = strtoupper($em[1]);
+
+        // Charset of the text/plain part; otherwise the first charset token found; defaults to utf-8
+        $charset = 'utf-8';
+        if (preg_match('/"TEXT"\s+"PLAIN"\s+\([^)]*"CHARSET"\s+"([^"]+)"/i', $struct, $cm)) {
+            $charset = strtolower($cm[1]);
+        } elseif (preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $struct, $cm)) {
+            $charset = strtolower(trim($cm[1], '"'));
+        }
+
+        // Infer the section number from the parenthesis nesting depth in front of the first text/plain entry
+        // Single-part: BODYSTRUCTURE ("TEXT" "PLAIN" ...) → BODY[TEXT]
+        // Multipart:   BODYSTRUCTURE (("TEXT" "PLAIN" ...) ...) → BODY[1]
+        // Nested:      BODYSTRUCTURE ((("TEXT" "PLAIN" ...) ...) ...) → BODY[1.1]
+        $sections = [];
+        $plainPos = stripos($struct, '"TEXT" "PLAIN"');
+        if ($plainPos !== false) {
+            $bsPos = stripos($struct, 'BODYSTRUCTURE');
+            $after = ($bsPos !== false) ? substr($struct, $bsPos + 13) : $struct;
+            $plainInAfter = stripos($after, '"TEXT" "PLAIN"');
+            if ($plainInAfter !== false) {
+                $beforePlain = substr($after, 0, $plainInAfter);
+                $depth = substr_count($beforePlain, '(') - substr_count($beforePlain, ')');
+                if ($depth <= 1) {
+                    $sections[] = 'TEXT';  // single-part message
+                } elseif ($depth === 2) {
+                    $sections[] = '1';     // direct child of a multipart
+                } elseif ($depth >= 3) {
+                    $sections[] = '1.1';   // nested multipart
+                }
+            }
+        }
+        // Append the remaining candidates as fallbacks, skipping any already queued
+        foreach (['1', '1.1', 'TEXT'] as $fb) {
+            if (!in_array($fb, $sections)) $sections[] = $fb;
+        }
+
+        error_log("IMAP sections to try for msg#{$num}: " . implode(', ', $sections) . " | plainEnc={$plainEncoding} htmlEnc={$htmlEncoding} charset={$charset}");
+
+        // Try the sections in order and keep the first one that yields non-blank data
+        $body = '';
+        $usedSection = '';
+        foreach ($sections as $sec) {
+            $resp = $this->command("FETCH {$num} BODY.PEEK[{$sec}]");
+            $data = $this->extractLiteral($resp);
+            if ($data && strlen(trim($data)) > 0) {
+                $body = $data;
+                $usedSection = $sec;
+                break;
+            }
        }

        if (!$body) return '';

-        // Detect encoding from BODYSTRUCTURE
-        $encoding = '';
-        // Parse BODYSTRUCTURE for encoding (7BIT, BASE64, QUOTED-PRINTABLE)
-        if (preg_match('/"TEXT"\s+"PLAIN"\s+\([^)]*\)\s+NIL\s+NIL\s+"([^"]+)"/i', $structLine, $em)) {
-            $encoding = strtoupper($em[1]);
-        } elseif (preg_match('/BODY\[1\].*?"([^"]+)"/i', $structLine, $em)) {
+        // Decide which transfer encoding to apply
+        $encoding = $plainEncoding;
+        // If no text/plain encoding was matched, fall back to the first encoding token anywhere in the structure
+        if (!$encoding) {
+            if (preg_match('/"(BASE64|QUOTED-PRINTABLE|7BIT|8BIT)"/i', $struct, $em)) {
                $encoding = strtoupper($em[1]);
            }
+        }

-        // Try to detect encoding from body content if not found
-        if (!$encoding) {
-            // Check if it looks like base64
-            if (preg_match('/^[A-Za-z0-9+\/=\s]+$/', trim($body)) && strlen(trim($body)) > 50) {
-                $decoded = @base64_decode($body, true);
-                if ($decoded !== false && strlen($decoded) > 0) {
-                    // Verify it produces readable text
-                    if (preg_match('/[\x20-\x7E\xC0-\xFF]/', $decoded)) {
+        error_log("IMAP body msg#{$num}: section={$usedSection} encoding={$encoding} bodyLen=" . strlen($body) . " first100=" . substr($body, 0, 100));
+
+        // Decode the content according to the declared encoding
+        if ($encoding === 'BASE64') {
+            $decoded = @base64_decode($body);
+            if ($decoded !== false) $body = $decoded;
+        } elseif ($encoding === 'QUOTED-PRINTABLE') {
+            $body = quoted_printable_decode($body);
+        }
+
+        // If the encoding is unknown or declared as 7BIT/8BIT, try content-based detection
+        if (!$encoding || $encoding === '7BIT' || $encoding === '8BIT') {
+            $trimmed = trim($body);
+            // Check whether it looks like base64 (only base64 alphabet and whitespace, longer than 50 bytes)
+            if (preg_match('/^[A-Za-z0-9+\/=\s]+$/', $trimmed) && strlen($trimmed) > 50) {
+                $decoded = @base64_decode($trimmed, true);
+                if ($decoded !== false && strlen($decoded) > 0 && preg_match('/[\x20-\x7E\xC0-\xFF]/', $decoded)) {
+                    $body = $decoded;
+                }
+            }
+            // Check whether it looks like quoted-printable (contains =XX escapes); accept only if decoding shrinks it
+            elseif (preg_match('/=[0-9A-Fa-f]{2}/', $body) && substr_count($body, '=') > 3) {
+                $decoded = quoted_printable_decode($body);
+                if (strlen($decoded) < strlen($body)) {
                    $body = $decoded;
                }
            }
        }
-        } else {
-            if ($encoding === 'BASE64') {
-                $body = base64_decode($body);
-            } elseif ($encoding === 'QUOTED-PRINTABLE') {
-                $body = quoted_printable_decode($body);
-            }
+
+        // If the body contains multipart boundaries (the wrong section was fetched), try to parse out the plain text
+        if (preg_match('/^--[^\r\n]+\r?\n/m', $body) && preg_match('/Content-Type:/i', $body)) {
+            error_log("IMAP msg#{$num}: body contains MIME boundaries, trying to extract text/plain");
+            $extracted = $this->extractPlainFromMultipart($body);
+            if ($extracted) $body = $extracted;
        }

-        // Strip HTML if it looks like HTML
+        // Strip HTML if the content is HTML
        if (preg_match('/<html|<body|<div|<p\b/i', $body)) {
+            // First account for the transfer encoding of the text/html part
+            if (!$plainEncoding && $htmlEncoding === 'BASE64') {
+                // Body may still be base64-encoded HTML. NOTE(review): this runs only after the body already matched an HTML tag above - confirm it is reachable for encoded input
+                $decoded = @base64_decode($body);
+                if ($decoded !== false && preg_match('/<html|<body|<div|<p\b/i', $decoded)) {
+                    $body = $decoded;
+                }
+            }
            $body = strip_tags($body);
-            // Clean up whitespace
            $body = preg_replace('/\n{3,}/', "\n\n", $body);
+            $body = preg_replace('/[ \t]+\n/', "\n", $body);
        }

-        // Try charset conversion
-        if (preg_match('/charset[="\s]+([^\s;"]+)/i', $structLine, $cm)) {
-            $charset = strtolower(trim($cm[1], '"'));
+        // Charset conversion to UTF-8 (skipped when the charset is already utf-8)
        if ($charset && $charset !== 'utf-8') {
            $converted = @iconv($charset, 'UTF-8//IGNORE', $body);
            if ($converted !== false) $body = $converted;
        }
-        }

        return trim($body);
    }

+    /**
+     * If fetchBody returned raw multipart data, try to parse the text/plain part out of it; falls back to a tag-stripped text/html part and returns an empty string when neither is found.
+     */
+    private function extractPlainFromMultipart(string $raw): string {
+        // Find the boundary: the first line starting with -- is taken as the delimiter
+        if (!preg_match('/^--([^\r\n]+)/m', $raw, $bm)) return '';
+        $boundary = $bm[1];
+        $parts = explode('--' . $boundary, $raw);
+
+        foreach ($parts as $part) {
+            $part = trim($part);
+            if (!$part || $part === '--') continue;
+            // Look for a text/plain Content-Type header in this part
+            if (preg_match('/Content-Type:\s*text\/plain/i', $part)) {
+                // Split headers from body at the first blank line
+                $split = preg_split('/\r?\n\r?\n/', $part, 2);
+                if (count($split) < 2) continue;
+                $headers = $split[0];
+                $body = $split[1];
+                // Apply the declared Content-Transfer-Encoding (base64 or quoted-printable)
+                if (preg_match('/Content-Transfer-Encoding:\s*base64/i', $headers)) {
+                    $body = base64_decode($body);
+                } elseif (preg_match('/Content-Transfer-Encoding:\s*quoted-printable/i', $headers)) {
+                    $body = quoted_printable_decode($body);
+                }
+                // Convert to UTF-8 when the headers declare a charset other than utf-8
+                if (preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $headers, $cm)) {
+                    $cs = strtolower(trim($cm[1], '"'));
+                    if ($cs && $cs !== 'utf-8') {
+                        $converted = @iconv($cs, 'UTF-8//IGNORE', $body);
+                        if ($converted !== false) $body = $converted;
+                    }
+                }
+                return trim($body);
+            }
+        }
+
+        // If no text/plain part was found, try text/html and strip the tags
+        foreach ($parts as $part) {
+            $part = trim($part);
+            if (!$part || $part === '--') continue;
+            if (preg_match('/Content-Type:\s*text\/html/i', $part)) {
+                $split = preg_split('/\r?\n\r?\n/', $part, 2);
+                if (count($split) < 2) continue;
+                $headers = $split[0];
+                $body = $split[1];
+                if (preg_match('/Content-Transfer-Encoding:\s*base64/i', $headers)) {
+                    $body = base64_decode($body);
+                } elseif (preg_match('/Content-Transfer-Encoding:\s*quoted-printable/i', $headers)) {
+                    $body = quoted_printable_decode($body);
+                }
+                if (preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $headers, $cm)) {
+                    $cs = strtolower(trim($cm[1], '"'));
+                    if ($cs && $cs !== 'utf-8') {
+                        $converted = @iconv($cs, 'UTF-8//IGNORE', $body);
+                        if ($converted !== false) $body = $converted;
+                    }
+                }
+                $body = strip_tags($body);
+                $body = preg_replace('/\n{3,}/', "\n\n", $body);
+                return trim($body);
+            }
+        }
+
+        return '';
+    }
+
    private function parseHeaders(string $raw): array {
        $headers = [];
        $lines = explode("\n", str_replace("\r\n", "\n", $raw));