Files
andros 27a9bbffac Render code blocks in posts; fix parser for nested :PROPERTIES: in fenced blocks; bump 1.3 (19)
- Add HighlighterSwift dependency for syntax-highlighted #+BEGIN_SRC blocks
- New CodeBlockView: SRC (highlighted), QUOTE (accent border), EXAMPLE (monospace)
- Fix OrgSocialParser.parsePostBlock: break after outer :END: so nested :PROPERTIES:/:END: inside #+BEGIN_SRC blocks no longer overwrite contentStart and eat intro text
- Fix OrgSocialParser.extractText: preserve fenced block delimiters and content
- Fix OrgSocialParser post-grouping: ignore ** headings inside fenced blocks
- Fix PartialFeedFetcher.extractRecentPostBlocks: same fenced-block guard
- Add 9 OrgBodyRenderer edge-case tests (block at start/end, multiple blocks, org headings inside block, etc.)
- Add 2 parser tests for nested :PROPERTIES: inside fenced block body
2026-05-22 08:12:30 +02:00

265 lines
11 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import Foundation
/// Fetches only the portions of a `social.org` file that are needed for a
/// given use case, using HTTP Range requests when the server supports them.
///
/// **`fetchProfileHeader(from:)`** - Discover use case.
/// Downloads only the file header (everything before `* Posts`) to extract
/// nick, avatar, and description without downloading the posts section.
/// For a feed with 500 posts the saving is typically 90-99 % of bandwidth.
///
/// **`fetchSince(from:since:)`** - Timeline use case.
/// Downloads the header section plus posts newer than `since`, skipping all
/// older posts. Falls back to a full download when Range is unsupported or
/// when any partial request fails.
///
/// Both methods accept any HTTP/HTTPS URL and return `nil` on error.
public struct PartialFeedFetcher: Sendable {
/// HTTP session through which every request of this fetcher is issued.
private let session: URLSession
/// Parser applied to the trimmed header text in `fetchProfileHeader(from:)`.
private let parser: OrgSocialParser
/// Number of leading bytes requested when probing for the file header
/// (16 KB, chosen to cover virtually all headers).
private let headerProbeSize = 16 * 1_024
/// Upper bound on the bytes requested from the end of the file when
/// looking for recent posts (128 KB).
private let postsTailSize = 128 * 1_024

/// Creates a fetcher.
///
/// - Parameter session: Session used for all network traffic; defaults to
///   `URLSession.shared`.
public init(session: URLSession = .shared) {
    parser = OrgSocialParser()
    self.session = session
}
// MARK: - Feature 2: Header-only fetch
/// Downloads only the file header (before `* Posts`) and returns the
/// parsed profile.
///
/// Sends `Range: bytes=0-16383` (the first `headerProbeSize` bytes). Any 2xx
/// response is accepted: a 206 carries just the probe, while a 200 means the
/// server ignored Range and returned the whole file. In both cases the text
/// is cut at the `* Posts` heading before it is handed to the parser, so no
/// second request is needed.
///
/// A byte range can end in the middle of a multi-byte UTF-8 character, which
/// makes a strict UTF-8 decode fail. For 206 responses the decode is therefore
/// retried with up to three trailing bytes dropped before giving up.
///
/// - Parameter url: Feed location; must use the `http` or `https` scheme.
/// - Returns: The parsed profile with `feedURL` set to `url`, or `nil` on
///   network error, non-2xx status, undecodable body, or unsupported scheme.
public func fetchProfileHeader(from url: URL) async -> OrgSocialProfile? {
    guard url.scheme == "http" || url.scheme == "https" else { return nil }

    var request = URLRequest(url: url)
    request.timeoutInterval = 10
    request.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")

    guard let (data, response) = try? await session.data(for: request),
          let http = response as? HTTPURLResponse,
          (200..<300).contains(http.statusCode) else {
        return nil
    }

    var decoded = String(data: data, encoding: .utf8)
    if decoded == nil, http.statusCode == 206 {
        // The probe may have been cut inside a UTF-8 sequence (at most 3 bytes
        // of a 4-byte character can be present). Only partial responses get
        // this leniency; a complete body that is invalid stays an error.
        for dropped in 1...3 where dropped < data.count {
            if let text = String(data: data.dropLast(dropped), encoding: .utf8) {
                decoded = text
                break
            }
        }
    }
    guard let text = decoded else { return nil }

    let headerText = trimToHeaderSection(text)
    var profile = parser.parse(headerText)
    profile.feedURL = url
    return profile
}
// MARK: - Feature 3: Date-filtered fetch for Timeline
/// Downloads the file header + posts with `date >= since` and returns them
/// as a single org document (`header`, then a `* Posts` section).
///
/// Strategy:
/// 1. Fetch the first `headerProbeSize` bytes (`Range: bytes=0-N`).
///    The 206 response includes `Content-Range: bytes 0-N/TOTAL`, giving
///    the total file size without a separate probe request.
/// 2. If Range is supported (206): fetch the last `postsTailSize` bytes
///    and reconstruct a valid org document from header + tail.
/// 3. If Range is unsupported (200): the full file is already in the
///    response body - parse posts directly, no second request needed.
///
/// The 128 KB tail covers roughly 650 posts at ~200 bytes each.
/// Falls back to a full download when any range request fails, when the
/// header probe does not reach the `* Posts` heading, or when the tail
/// contains no post heading, ensuring correctness at the cost of the
/// bandwidth saving.
///
/// Range boundaries may split a multi-byte UTF-8 character; partial bodies
/// are decoded leniently at the cut edge so non-ASCII feeds still benefit
/// from the range optimisation.
///
/// NOTE(review): only the tail window is scanned for posts. Posts newer than
/// `since` that lie before the window are not returned - confirm that this
/// trade-off is acceptable for callers passing an old `since`.
///
/// - Parameters:
///   - url: Feed location; must use the `http` or `https` scheme.
///   - since: Lower bound (inclusive) for post timestamps.
/// - Returns: The reconstructed document, the result of the full-download
///   fallback, or `nil` for an unsupported scheme or when the fallback fails.
public func fetchSince(from url: URL, since: Date) async -> String? {
    guard url.scheme == "http" || url.scheme == "https" else { return nil }

    // Request the header section and piggyback the file-size probe on the
    // same round-trip: a 206 response includes Content-Range with the total
    // file size, so no separate probe request is needed.
    var headerRequest = URLRequest(url: url)
    headerRequest.timeoutInterval = 10
    headerRequest.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")
    guard let (headerData, headerResponse) = try? await session.data(for: headerRequest),
          let http = headerResponse as? HTTPURLResponse,
          (200..<300).contains(http.statusCode) else {
        return await downloadEntireFeed(from: url)
    }

    // 200 means the server ignored Range and returned the full file body.
    // Parse posts directly from the complete response - no second request.
    if http.statusCode == 200 {
        guard let fullText = String(data: headerData, encoding: .utf8) else {
            return await downloadEntireFeed(from: url)
        }
        return assembleDocument(
            header: trimToHeaderSection(fullText),
            recentPosts: extractRecentPostBlocks(from: fullText, since: since)
        )
    }

    // 206: extract total file size from Content-Range to locate the tail.
    guard let crHeader = http.value(forHTTPHeaderField: "Content-Range"),
          let fileSize = parseContentRangeTotal(crHeader),
          fileSize > 0 else {
        return await downloadEntireFeed(from: url)
    }

    // If the whole file fits within the header probe we already have all
    // content - skip the tail fetch entirely. The body is complete, so it
    // is decoded strictly.
    if fileSize <= headerProbeSize {
        guard let wholeText = String(data: headerData, encoding: .utf8) else {
            return await downloadEntireFeed(from: url)
        }
        return assembleDocument(
            header: trimToHeaderSection(wholeText),
            recentPosts: extractRecentPostBlocks(from: wholeText, since: since)
        )
    }

    // The probe is a true prefix of a larger file. It must reach the
    // `* Posts` heading; otherwise the header is cut mid-way and gluing it
    // to the tail would produce a corrupted document.
    guard let probeText = decodeIgnoringTruncatedTail(headerData),
          probeText.contains("\n* Posts") || probeText.hasPrefix("* Posts") else {
        return await downloadEntireFeed(from: url)
    }
    let headerSection = trimToHeaderSection(probeText)

    // Fetch posts tail (the final `postsTailSize` bytes).
    let tailStart = max(0, fileSize - postsTailSize)
    guard let tailData = await downloadRange(from: url, start: tailStart, end: fileSize - 1),
          let tailRaw = (tailStart > 0
              ? decodeIgnoringTruncatedHead(tailData)
              : String(data: tailData, encoding: .utf8)) else {
        return await downloadEntireFeed(from: url)
    }

    // Safety: if the tail doesn't contain any post heading, the tail window
    // didn't reach the `* Posts` section (unusually large header or file).
    // Fall back to full download so we don't silently lose posts.
    let tailHasPosts = tailRaw.contains("\n** ") || tailRaw.hasPrefix("** ")
    if tailStart > 0 && !tailHasPosts {
        return await downloadEntireFeed(from: url)
    }

    return assembleDocument(
        header: headerSection,
        recentPosts: extractRecentPostBlocks(from: tailRaw, since: since)
    )
}

/// Full-download fallback: fetches the complete feed via `FeedFetcher`,
/// mapping any thrown error to `nil`.
private func downloadEntireFeed(from url: URL) async -> String? {
    return try? await FeedFetcher(session: session).fetch(from: url)
}

/// Joins a header section and the already-filtered post blocks into one org
/// document with a `* Posts` heading between them.
private func assembleDocument(header: String, recentPosts: String) -> String {
    let postsBlock = recentPosts.isEmpty
        ? "* Posts\n"
        : "* Posts\n\n" + recentPosts + "\n"
    return header + "\n" + postsBlock
}

/// Decodes UTF-8 data whose end may fall inside a multi-byte character by
/// retrying with up to three trailing bytes removed.
private func decodeIgnoringTruncatedTail(_ data: Data) -> String? {
    for dropped in 0...3 {
        if let text = String(data: data.dropLast(dropped), encoding: .utf8) {
            return text
        }
    }
    return nil
}

/// Decodes UTF-8 data whose start may fall inside a multi-byte character by
/// skipping up to three leading continuation bytes (`10xxxxxx`).
private func decodeIgnoringTruncatedHead(_ data: Data) -> String? {
    if let text = String(data: data, encoding: .utf8) { return text }
    let skipped = data.prefix(3).prefix(while: { ($0 & 0xC0) == 0x80 }).count
    guard skipped > 0 else { return nil }
    return String(data: data.dropFirst(skipped), encoding: .utf8)
}
// MARK: - Range helpers
/// Requests the inclusive byte range `start...end` of `url`.
///
/// - Returns: The response body for any 2xx status (which includes 206),
///   or `nil` when the request throws, the response is not HTTP, or the
///   status is outside 2xx.
private func downloadRange(from url: URL, start: Int, end: Int) async -> Data? {
    var rangeRequest = URLRequest(url: url)
    rangeRequest.timeoutInterval = 20
    rangeRequest.setValue("bytes=\(start)-\(end)", forHTTPHeaderField: "Range")

    do {
        let (body, response) = try await session.data(for: rangeRequest)
        guard let status = (response as? HTTPURLResponse)?.statusCode,
              (200...299).contains(status) else {
            return nil
        }
        return body
    } catch {
        return nil
    }
}
/// Extracts the total resource size from a `Content-Range` header value.
///
/// Example: `"bytes 0-0/12345"` -> `12345`.
///
/// Only a plain run of ASCII digits after the last `/` is accepted. An
/// unknown total (`*`), a signed value, or any other text yields `nil`, so
/// a malformed header can never produce a negative size.
///
/// - Parameter header: Raw header value.
/// - Returns: The total size in bytes, or `nil` when it cannot be determined.
private func parseContentRangeTotal(_ header: String) -> Int? {
    guard let slashIdx = header.lastIndex(of: "/") else { return nil }
    let totalStr = header[header.index(after: slashIdx)...]
        .trimmingCharacters(in: .whitespaces)
    guard !totalStr.isEmpty,
          totalStr.allSatisfy({ $0.isASCII && $0.isWholeNumber }) else {
        return nil
    }
    // `Int.init` still returns nil if the digit run overflows `Int`.
    return Int(totalStr)
}
// MARK: - Text helpers
/// Cuts `text` just before its `* Posts` heading.
///
/// A heading preceded by a newline is looked for first; the newline itself
/// is not part of the result. If there is none but the text begins with
/// `* Posts`, the header is empty. Without any such heading the text is
/// returned unchanged.
private func trimToHeaderSection(_ text: String) -> String {
    guard let headingStart = text.range(of: "\n* Posts")?.lowerBound else {
        return text.hasPrefix("* Posts") ? "" : text
    }
    return String(text[text.startIndex..<headingStart])
}
/// Cuts `text` into level-2 org post blocks and keeps those whose timestamp
/// is `>= since`.
///
/// Everything before the first `** ` heading is discarded (at a chunk
/// boundary it is the remainder of a partial post). Heading-like lines inside
/// `#+BEGIN_SRC` / `#+BEGIN_QUOTE` / `#+BEGIN_EXAMPLE` blocks do not start a
/// new post. Blocks without a recognisable date are dropped.
///
/// - Returns: The surviving blocks, each stripped of surrounding newlines,
///   joined by one blank line; empty when nothing qualifies.
private func extractRecentPostBlocks(from text: String, since: Date) -> String {
    let fenceOpeners = ["#+BEGIN_SRC", "#+BEGIN_QUOTE", "#+BEGIN_EXAMPLE"]
    let fenceClosers = ["#+END_SRC", "#+END_QUOTE", "#+END_EXAMPLE"]

    var completed: [String] = []
    // `nil` until the first post heading has been seen.
    var openBlock: [String]? = nil
    var insideFence = false

    for line in text.components(separatedBy: "\n") {
        let keyword = line.trimmingCharacters(in: .whitespaces).uppercased()
        if insideFence {
            if fenceClosers.contains(where: { keyword.hasPrefix($0) }) {
                insideFence = false
            }
        } else if fenceOpeners.contains(where: { keyword.hasPrefix($0) }) {
            insideFence = true
        }

        guard !insideFence, isPostHeadingLine(line) else {
            // Body line: belongs to the current post, if one has started.
            openBlock?.append(line)
            continue
        }

        if let finished = openBlock {
            completed.append(finished.joined(separator: "\n"))
        }
        openBlock = [line]
    }
    if let finished = openBlock {
        completed.append(finished.joined(separator: "\n"))
    }

    var kept: [String] = []
    for block in completed {
        guard let date = extractDateFromBlock(block), date >= since else { continue }
        kept.append(block.trimmingCharacters(in: .newlines))
    }
    return kept.joined(separator: "\n\n")
}
/// Tells whether `line` is a level-2 org heading.
///
/// The line must start with `**` and either end there or continue with a
/// space or tab; a third `*` (level 3+) or any other character disqualifies it.
private func isPostHeadingLine(_ line: String) -> Bool {
    guard line.hasPrefix("**") else { return false }
    guard let following = line.dropFirst(2).first else {
        // Exactly "**": a bare heading marker.
        return true
    }
    return following == " " || following == "\t"
}
/// Finds the RFC 3339 date of a post block.
///
/// The `** TIMESTAMP` heading on the first line is consulted first; if it
/// carries no leading timestamp, the first `:ID:` line whose value starts
/// with a timestamp is used. Whatever `PostWriter.parseTimestamp` makes of
/// that text is returned, without trying further candidates.
///
/// - Returns: The parsed date, or `nil` when no timestamp text is found.
private func extractDateFromBlock(_ block: String) -> Date? {
    let lines = block.components(separatedBy: "\n")
    guard let heading = lines.first else { return nil }

    if heading.hasPrefix("** ") {
        let title = String(heading.dropFirst(3)).trimmingCharacters(in: .whitespaces)
        if let stamp = leadingTimestamp(from: title) {
            return PostWriter.parseTimestamp(stamp)
        }
    }

    for rawLine in lines {
        let trimmed = rawLine.trimmingCharacters(in: .whitespaces)
        guard trimmed.hasPrefix(":ID:") else { continue }
        let idValue = String(trimmed.dropFirst(4)).trimmingCharacters(in: .whitespaces)
        guard let stamp = leadingTimestamp(from: idValue) else { continue }
        return PostWriter.parseTimestamp(stamp)
    }
    return nil
}
/// Returns the timestamp at the very start of `s`, if any.
///
/// The accepted shape is `YYYY-MM-DDTHH:MM:SS` followed by `Z` or a numeric
/// offset (`+HH:MM`, `+HHMM`, or the `-` equivalents). Text after the
/// timestamp is ignored; the match must begin at the first character.
private func leadingTimestamp(from s: String) -> String? {
    let pattern = #"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:?\d{2}|Z)"#
    if let match = s.range(of: pattern, options: .regularExpression) {
        return String(s[match])
    }
    return nil
}
}