Korjaa IMAP-sähköpostien body-dekoodaus kokonaan

Vanha fetchBody haki aina BODY[1] ja käytti haurasta regexiä
koodauksen tunnistamiseen BODYSTRUCTURE:sta → monet viestit
jäivät raakana base64/quoted-printable -muodossa.

Uusi toteutus:
- Parsii BODYSTRUCTURE:n nesting-syvyyden → oikea section-numero
  (TEXT/1/1.1 riippuen onko single-part/multipart/sisäkkäinen)
- Joustava regex koodauksen tunnistamiseen (tukee NIL/"str" body-id/desc)
- Automaattinen QP-tunnistus (=XX -koodien haku) base64:n lisäksi
- extractPlainFromMultipart: jos BODY[1] palauttaa raakaa
  multipart-dataa boundary-rajoineen, parsii text/plain tai
  text/html suoraan MIME-osista
- error_log debug-lokit BODYSTRUCTURE/section/encoding tiedoilla

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-11 12:43:17 +02:00
parent a898da119e
commit 9cb2eeeb62

212
api.php
View File

@@ -347,71 +347,211 @@ class ImapClient {
} }
/**
 * Fetch the readable text body of message $num, decoded and converted to UTF-8.
 *
 * Steps:
 *  1. FETCH BODYSTRUCTURE and parse it into a nested list so that the section
 *     number, transfer encoding and charset all come from the SAME MIME part
 *     (text/plain is preferred, text/html is the fallback).
 *  2. Fetch that section, falling back to the generic sections 1, 1.1 and TEXT.
 *  3. Undo the transfer encoding. The declared encoding is used only when the
 *     located section is the one that answered; otherwise the content is sniffed.
 *  4. If the data still contains raw MIME boundaries, pull the text part out of it.
 *  5. Convert to UTF-8 and strip HTML markup.
 *
 * @param int $num Message sequence number as used in the FETCH commands.
 * @return string Trimmed body text, or '' when no section returned any data.
 */
private function fetchBody(int $num): string {
    // NOTE(review): implode() implies command() returns an array of response lines - confirm.
    $resp = $this->command("FETCH {$num} BODYSTRUCTURE");
    $struct = implode(' ', $resp);
    error_log("IMAP BODYSTRUCTURE msg#{$num}: " . substr($struct, 0, 500));

    // Locate the text parts structurally instead of counting parentheses, so a
    // text/plain part that is not the first child gets its real section number.
    $tree = $this->parseBodyStructureTree($struct);
    $plainPart = $this->locateTextPart($tree, 'PLAIN');
    $htmlPart = $this->locateTextPart($tree, 'HTML');
    $part = $plainPart ? $plainPart : $htmlPart;
    $plainEncoding = $plainPart ? $plainPart['encoding'] : '';
    $htmlEncoding = $htmlPart ? $htmlPart['encoding'] : '';

    // Charset of the chosen part; otherwise any charset mentioned in the structure.
    $charset = $part ? $part['charset'] : '';
    if ($charset === '' && preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $struct, $cm)) {
        $charset = strtolower(trim($cm[1], '"'));
    }
    if ($charset === '') {
        $charset = 'utf-8';
    }

    // The located section goes first, followed by the generic fallbacks.
    $sections = $part ? [$part['section']] : [];
    foreach (['1', '1.1', 'TEXT'] as $fallback) {
        if (!in_array($fallback, $sections, true)) {
            $sections[] = $fallback;
        }
    }
    error_log("IMAP sections to try for msg#{$num}: " . implode(', ', $sections) . " | plainEnc={$plainEncoding} htmlEnc={$htmlEncoding} charset={$charset}");

    // Try the sections in order and keep the first one that returns non-blank data.
    $body = '';
    $usedSection = '';
    foreach ($sections as $sec) {
        $resp = $this->command("FETCH {$num} BODY.PEEK[{$sec}]");
        $data = $this->extractLiteral($resp);
        if (is_string($data) && trim($data) !== '') {
            $body = $data;
            $usedSection = $sec;
            break;
        }
    }
    if ($body === '') {
        return '';
    }

    // Trust the declared encoding only when it describes the section that was
    // actually fetched. For a single-part message TEXT and 1 address the same content.
    $encoding = '';
    if ($part) {
        $sameContent = $usedSection === $part['section']
            || ($part['section'] === 'TEXT' && $usedSection === '1');
        if ($sameContent) {
            $encoding = $part['encoding'];
        }
    } elseif (preg_match('/"(BASE64|QUOTED-PRINTABLE|7BIT|8BIT)"/i', $struct, $em)) {
        // Structure could not be parsed or has no text part: use the first encoding mentioned.
        $encoding = strtoupper($em[1]);
    }

    // NOTE(review): this writes the first 100 bytes of the message body to the error log.
    error_log("IMAP body msg#{$num}: section={$usedSection} encoding={$encoding} bodyLen=" . strlen($body) . " first100=" . substr($body, 0, 100));

    // Undo the declared transfer encoding.
    if ($encoding === 'BASE64') {
        $decoded = base64_decode($body);
        if ($decoded !== false) {
            $body = $decoded;
        }
    } elseif ($encoding === 'QUOTED-PRINTABLE') {
        $body = quoted_printable_decode($body);
    }

    // No usable declaration (or an identity encoding): sniff the content itself.
    if ($encoding === '' || $encoding === '7BIT' || $encoding === '8BIT') {
        $fromBase64 = $this->tryDecodeBase64Text($body);
        if ($fromBase64 !== '') {
            $body = $fromBase64;
        } elseif (preg_match('/=[0-9A-Fa-f]{2}/', $body) && substr_count($body, '=') > 3) {
            // Looks like quoted-printable; accept the decode only if it actually shrank the text.
            $decoded = quoted_printable_decode($body);
            if (strlen($decoded) < strlen($body)) {
                $body = $decoded;
            }
        }
    }

    // A container section was fetched: the data is raw MIME with boundaries and part headers.
    $extractedFromMime = false;
    if (preg_match('/^--[^\r\n]+\r?\n/m', $body) && preg_match('/Content-Type:/i', $body)) {
        error_log("IMAP msg#{$num}: body contains MIME boundaries, trying to extract text/plain");
        $extracted = $this->extractPlainFromMultipart($body);
        if ($extracted !== '') {
            $body = $extracted;
            $extractedFromMime = true;
        }
    }

    // Charset conversion. extractPlainFromMultipart() converts using the part's own
    // headers, so text that came from it and is valid UTF-8 must not be converted again.
    $alreadyUtf8 = $extractedFromMime && preg_match('//u', $body) === 1;
    if ($charset !== 'utf-8' && !$alreadyUtf8) {
        // '@' is deliberate: an unknown charset name should leave the text untouched, not emit notices.
        $converted = @iconv($charset, 'UTF-8//IGNORE', $body);
        if ($converted !== false) {
            $body = $converted;
        }
    }

    // Strip markup when the content is HTML, then tidy the leftover whitespace.
    if (preg_match('/<html|<body|<div|<p\b/i', $body)) {
        $body = strip_tags($body);
        $body = preg_replace('/[ \t]+\n/', "\n", $body);
        $body = preg_replace('/\n{3,}/', "\n\n", $body);
    }

    return trim($body);
}

/**
 * Parse the parenthesised list that follows the BODYSTRUCTURE keyword into nested
 * PHP arrays. Quoted strings and bare atoms (NIL, numbers) become strings.
 *
 * @return array The root body list, or [] when the keyword is missing or the list is unbalanced.
 */
private function parseBodyStructureTree(string $struct): array {
    $keyword = stripos($struct, 'BODYSTRUCTURE');
    $i = ($keyword === false) ? false : strpos($struct, '(', $keyword);
    if ($i === false) {
        return [];
    }

    $len = strlen($struct);
    $stack = [];
    $current = null;
    for (; $i < $len; $i++) {
        $ch = $struct[$i];
        if ($ch === '(') {
            if ($current !== null) {
                $stack[] = $current;
            }
            $current = [];
        } elseif ($ch === ')') {
            if (!$stack) {
                return $current; // the root list just closed
            }
            $parent = array_pop($stack);
            $parent[] = $current;
            $current = $parent;
        } elseif ($ch === '"') {
            // Quoted string; a backslash escapes the following character.
            $value = '';
            for ($i++; $i < $len && $struct[$i] !== '"'; $i++) {
                if ($struct[$i] === '\\' && $i + 1 < $len) {
                    $i++;
                }
                $value .= $struct[$i];
            }
            $current[] = $value;
        } elseif (strpos(" \t\r\n", $ch) === false) {
            // Bare atom: runs until whitespace, a parenthesis or a quote.
            $run = strcspn($struct, " \t\r\n()\"", $i);
            $current[] = substr($struct, $i, $run);
            $i += $run - 1;
        }
    }

    return []; // ran out of input before the root list closed
}

/**
 * Depth-first search of a parsed BODYSTRUCTURE for the first text/<subtype> part.
 *
 * @param array  $node    Body list produced by parseBodyStructureTree().
 * @param string $subtype MIME subtype to look for, e.g. 'PLAIN' or 'HTML'.
 * @param string $path    Section path of $node; '' for the root body.
 * @return array ['section' => string, 'encoding' => string, 'charset' => string], or [] when not found.
 */
private function locateTextPart(array $node, string $subtype, string $path = ''): array {
    if (!$node) {
        return [];
    }

    // A multipart body starts with its child bodies (lists) followed by the subtype string.
    if (is_array($node[0])) {
        $index = 0;
        foreach ($node as $child) {
            if (!is_array($child)) {
                break; // reached the multipart subtype; what follows is extension data
            }
            $index++;
            $childPath = ($path === '') ? (string) $index : "{$path}.{$index}";
            $found = $this->locateTextPart($child, $subtype, $childPath);
            if ($found) {
                return $found;
            }
        }
        return [];
    }

    // Single part layout: type, subtype, params, id, description, encoding, size, ...
    $matches = isset($node[1]) && is_string($node[1])
        && strcasecmp($node[0], 'TEXT') === 0
        && strcasecmp($node[1], $subtype) === 0;
    if (!$matches) {
        return [];
    }

    $charset = '';
    if (isset($node[2]) && is_array($node[2])) {
        $params = $node[2];
        $count = count($params);
        for ($k = 0; $k + 1 < $count; $k += 2) {
            if (is_string($params[$k]) && is_string($params[$k + 1]) && strcasecmp($params[$k], 'CHARSET') === 0) {
                $charset = strtolower($params[$k + 1]);
                break;
            }
        }
    }

    $encoding = (isset($node[5]) && is_string($node[5])) ? strtoupper($node[5]) : '';
    if ($encoding === 'NIL') {
        $encoding = '';
    }

    return [
        'section' => ($path === '') ? 'TEXT' : $path, // a single-part message is addressed as TEXT
        'encoding' => $encoding,
        'charset' => $charset,
    ];
}

/**
 * Decode $body as base64 only when it really looks like base64-encoded text.
 * Lines must contain nothing but base64 characters (no interior spaces), the total
 * length must be a multiple of four, and the result must be free of control bytes.
 *
 * @return string The decoded text, or '' when $body should be left alone.
 */
private function tryDecodeBase64Text(string $body): string {
    $compact = (string) preg_replace('/[ \t]*[\r\n]+/', '', trim($body));
    if (strlen($compact) <= 50 || strlen($compact) % 4 !== 0) {
        return '';
    }
    if (!preg_match('/^[A-Za-z0-9+\/]+={0,2}$/', $compact)) {
        return '';
    }
    $decoded = base64_decode($compact, true);
    if ($decoded === false || $decoded === '') {
        return '';
    }
    // Control bytes other than tab/CR/LF mean binary data or a false positive.
    if (preg_match('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', $decoded)) {
        return '';
    }
    return $decoded;
}
/**
 * Extract readable text from raw multipart MIME data (boundary lines plus part headers).
 *
 * A text/plain part is preferred; when none is found, the first text/html part is
 * used with its tags stripped. Nested multipart containers are searched as well.
 * The transfer encoding (base64 / quoted-printable) is undone and the text is
 * converted to UTF-8 according to the charset declared in the part's own headers.
 *
 * @param string $raw Raw MIME data that starts with, or contains, a "--boundary" line.
 * @return string Trimmed text, or '' when no boundary or no non-empty text part is found.
 */
private function extractPlainFromMultipart(string $raw): string {
    $plain = $this->findMimeTextPart($raw, 'plain', 0);
    if ($plain !== '') {
        return $plain;
    }

    $html = $this->findMimeTextPart($raw, 'html', 0);
    if ($html === '') {
        return '';
    }
    $html = strip_tags($html);
    $html = preg_replace('/\n{3,}/', "\n\n", $html);
    return trim($html);
}

/**
 * Return the decoded, UTF-8 converted body of the first text/<subtype> part in $raw,
 * descending into nested multipart parts. Returns '' when nothing suitable is found.
 *
 * @param string $raw     Raw multipart data.
 * @param string $subtype 'plain' or 'html'.
 * @param int    $depth   Current nesting level; bounds recursion on hostile input.
 */
private function findMimeTextPart(string $raw, string $subtype, int $depth): string {
    // The first line starting with "--" is taken to be the boundary delimiter.
    if ($depth > 10 || !preg_match('/^--([^\r\n]+)/m', $raw, $bm)) {
        return '';
    }
    $boundary = rtrim($bm[1]);

    // Split on whole delimiter lines only, so a longer boundary that shares this
    // prefix (typical for nested containers) does not break a part apart.
    $parts = preg_split('/^--' . preg_quote($boundary, '/') . '(?:--)?[ \t]*\r?$/m', $raw);
    if ($parts === false) {
        return '';
    }

    foreach ($parts as $part) {
        $part = trim($part);
        if ($part === '' || $part === '--') {
            continue;
        }

        // Headers end at the first blank line.
        $split = preg_split('/\r?\n\r?\n/', $part, 2);
        if ($split === false || count($split) < 2) {
            continue;
        }
        list($headers, $body) = $split;

        // Only the headers decide the type; the body may quote "Content-Type:" itself.
        if (preg_match('/^Content-Type:\s*multipart\//im', $headers)) {
            $nested = $this->findMimeTextPart($body, $subtype, $depth + 1);
            if ($nested !== '') {
                return $nested;
            }
            continue;
        }
        if (!preg_match('/^Content-Type:\s*text\/' . preg_quote($subtype, '/') . '\b/im', $headers)) {
            continue;
        }

        // Undo the transfer encoding.
        if (preg_match('/Content-Transfer-Encoding:\s*base64/i', $headers)) {
            $decoded = base64_decode($body);
            if ($decoded !== false) {
                $body = $decoded;
            }
        } elseif (preg_match('/Content-Transfer-Encoding:\s*quoted-printable/i', $headers)) {
            $body = quoted_printable_decode($body);
        }

        // Convert to UTF-8 when the part declares another charset.
        if (preg_match('/charset[="\s]+([^\s;"\\)]+)/i', $headers, $cm)) {
            $cs = strtolower(trim($cm[1], '"'));
            if ($cs !== '' && $cs !== 'utf-8') {
                // '@' is deliberate: an unknown charset name should leave the text untouched.
                $converted = @iconv($cs, 'UTF-8//IGNORE', $body);
                if ($converted !== false) {
                    $body = $converted;
                }
            }
        }

        $body = trim($body);
        if ($body !== '') {
            return $body;
        }
    }

    return '';
}
private function parseHeaders(string $raw): array { private function parseHeaders(string $raw): array {
$headers = []; $headers = [];
$lines = explode("\n", str_replace("\r\n", "\n", $raw)); $lines = explode("\n", str_replace("\r\n", "\n", $raw));