27a9bbffac
- Add HighlighterSwift dependency for syntax-highlighted #+BEGIN_SRC blocks - New CodeBlockView: SRC (highlighted), QUOTE (accent border), EXAMPLE (monospace) - Fix OrgSocialParser.parsePostBlock: break after outer :END: so nested :PROPERTIES:/:END: inside #+BEGIN_SRC blocks no longer overwrite contentStart and eat intro text - Fix OrgSocialParser.extractText: preserve fenced block delimiters and content - Fix OrgSocialParser post-grouping: ignore ** headings inside fenced blocks - Fix PartialFeedFetcher.extractRecentPostBlocks: same fenced-block guard - Add 9 OrgBodyRenderer edge-case tests (block at start/end, multiple blocks, org headings inside block, etc.) - Add 2 parser tests for nested :PROPERTIES: inside fenced block body
265 lines
11 KiB
Swift
265 lines
11 KiB
Swift
import Foundation
|
||
|
||
/// Fetches only the portions of a `social.org` file that are needed for a
|
||
/// given use case, using HTTP Range requests when the server supports them.
|
||
///
|
||
/// **`fetchProfileHeader(from:)`** — Discover use case.
|
||
/// Downloads only the file header (everything before `* Posts`) to extract
|
||
/// nick, avatar, and description without downloading the posts section.
|
||
/// For a feed with 500 posts the saving is typically 90–99 % of bandwidth.
|
||
///
|
||
/// **`fetchSince(from:since:)`** — Timeline use case.
|
||
/// Downloads the header section plus posts newer than `since`, skipping all
|
||
/// older posts. Falls back to a full download when Range is unsupported or
|
||
/// when any partial request fails.
|
||
///
|
||
/// Both methods accept any HTTP/HTTPS URL and return `nil` on error.
|
||
public struct PartialFeedFetcher: Sendable {
|
||
|
||
/// URL session through which every request of this fetcher is issued.
private let session: URLSession
/// Parser applied to the downloaded header text.
private let parser: OrgSocialParser

/// Length in bytes of the leading range requested to read the file header
/// (16 KB, which covers virtually all headers).
private let headerProbeSize = 16 * 1024
/// Upper bound in bytes on the trailing range requested for recent posts (128 KB).
private let postsTailSize = 128 * 1024
|
||
|
||
/// Creates a fetcher that issues its requests through `session`.
///
/// - Parameter session: URL session to use; defaults to `URLSession.shared`.
public init(session: URLSession = .shared) {
    parser = OrgSocialParser()
    self.session = session
}
|
||
|
||
// MARK: - Feature 2: Header-only fetch
|
||
|
||
/// Downloads only the file header (everything before `* Posts`) and returns
/// the profile parsed from it.
///
/// Sends `Range: bytes=0-(headerProbeSize - 1)` and accepts any 2xx answer.
/// Whatever the server returns (the requested prefix, or the whole file when
/// it ignores `Range`), the text is cut at the `* Posts` heading before it is
/// handed to the parser, so the posts section never reaches the parser.
///
/// - Parameter url: HTTP or HTTPS location of the feed.
/// - Returns: The parsed profile with `feedURL` set to `url`, or `nil` on an
///   unsupported scheme, a failed request, a non-2xx status, or a body that
///   cannot be decoded as UTF-8.
public func fetchProfileHeader(from url: URL) async -> OrgSocialProfile? {
    guard url.scheme == "http" || url.scheme == "https" else { return nil }

    var request = URLRequest(url: url)
    request.timeoutInterval = 10
    request.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")

    // 206 is already part of 200..<300, so one range test is sufficient.
    guard let (data, response) = try? await session.data(for: request),
          let http = response as? HTTPURLResponse,
          (200..<300).contains(http.statusCode) else {
        return nil
    }

    var decoded = String(data: data, encoding: .utf8)
    if decoded == nil, http.statusCode == 206 {
        // A byte range may end in the middle of a multi-byte UTF-8 sequence,
        // which makes strict decoding fail for an otherwise valid file.
        // A sequence is at most 4 bytes, so at most 3 dangling bytes need
        // to be dropped to reach the previous character boundary.
        for dropped in 1...3 where dropped <= data.count {
            if let text = String(data: data.dropLast(dropped), encoding: .utf8) {
                decoded = text
                break
            }
        }
    }
    guard let text = decoded else { return nil }

    let headerText = trimToHeaderSection(text)
    var profile = parser.parse(headerText)
    profile.feedURL = url
    return profile
}
|
||
|
||
// MARK: - Feature 3: Date-filtered fetch for Timeline
|
||
|
||
/// Downloads the file header plus the posts whose date is `>= since` and
/// returns them as one org document (`header`, `* Posts`, recent posts).
///
/// Strategy:
/// 1. Request the first `headerProbeSize` bytes (`Range: bytes=0-N`).
///    A 206 answer carries `Content-Range: bytes 0-N/TOTAL`, which gives the
///    total file size without a separate probe request.
/// 2. On 200 the body is used as the complete file: posts are filtered
///    directly from it and no second request is made.
/// 3. On 206 with a file larger than the probe, the last `postsTailSize`
///    bytes are requested and the document is rebuilt from header + tail.
///
/// The method falls back to `FeedFetcher.fetch(from:)` (the full download)
/// whenever the partial result cannot be trusted:
/// - a request fails or a body cannot be decoded as UTF-8;
/// - `Content-Range` is missing or unparsable;
/// - the probe ends before the `* Posts` heading (header would be cut off);
/// - the tail contains no post heading;
/// - every dated post in the tail is already `>= since` and the tail does
///   not include the `* Posts` heading, i.e. older-but-still-wanted posts
///   may lie before the tail window.
///
/// - Parameters:
///   - url: HTTP or HTTPS location of the feed.
///   - since: Lower bound (inclusive) for the post dates to keep.
/// - Returns: The reconstructed org text, or `nil` for an unsupported URL
///   scheme or when the full-download fallback itself yields nothing.
public func fetchSince(from url: URL, since: Date) async -> String? {
    guard url.scheme == "http" || url.scheme == "https" else { return nil }

    // Last-resort path shared by every failure branch below.
    func fullDownload() async -> String? {
        return try? await FeedFetcher(session: session).fetch(from: url)
    }

    // Decodes a byte slice whose edges may fall inside a multi-byte UTF-8
    // sequence: leading continuation bytes (10xxxxxx) are skipped and up to
    // three dangling trailing bytes are dropped before giving up.
    func decodeSlice(_ bytes: Data) -> String? {
        let aligned = bytes.drop(while: { $0 & 0xC0 == 0x80 })
        for trailing in 0...3 where trailing <= aligned.count {
            if let text = String(data: aligned.dropLast(trailing), encoding: .utf8) {
                return text
            }
        }
        return nil
    }

    // Builds the final document from the header text and the filtered posts.
    func assemble(header: String, posts: String) -> String {
        let postsBlock = posts.isEmpty
            ? "* Posts\n"
            : "* Posts\n\n" + posts + "\n"
        return header + "\n" + postsBlock
    }

    // The header request doubles as the file-size probe (see Content-Range below).
    var headerRequest = URLRequest(url: url)
    headerRequest.timeoutInterval = 10
    headerRequest.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")

    guard let (headerData, headerResponse) = try? await session.data(for: headerRequest),
          let http = headerResponse as? HTTPURLResponse,
          (200..<300).contains(http.statusCode) else {
        return await fullDownload()
    }

    // Only a 206 body is a slice that may be cut mid-character; anything
    // else is decoded strictly, exactly as a complete file would be.
    let decodedHeader = http.statusCode == 206
        ? decodeSlice(headerData)
        : String(data: headerData, encoding: .utf8)
    guard let headerRaw = decodedHeader else {
        return await fullDownload()
    }

    let headerSection = trimToHeaderSection(headerRaw)

    // 200: the server ignored Range, so the body is treated as the full file.
    if http.statusCode == 200 {
        return assemble(header: headerSection,
                        posts: extractRecentPostBlocks(from: headerRaw, since: since))
    }

    // Otherwise the total size must come from Content-Range to locate the tail.
    guard let crHeader = http.value(forHTTPHeaderField: "Content-Range"),
          let fileSize = parseContentRangeTotal(crHeader) else {
        return await fullDownload()
    }

    // The whole file fits in the probe: everything is already in memory.
    if fileSize <= headerProbeSize {
        return assemble(header: headerSection,
                        posts: extractRecentPostBlocks(from: headerRaw, since: since))
    }

    // `trimToHeaderSection` returns its input unchanged when it finds no
    // `* Posts` heading. For a file larger than the probe that means the
    // header itself was cut off, so the partial header must not be used.
    if headerSection == headerRaw {
        return await fullDownload()
    }

    // Fetch the final `postsTailSize` bytes of the file.
    let tailStart = max(0, fileSize - postsTailSize)
    guard let tailData = await downloadRange(from: url, start: tailStart, end: fileSize - 1),
          let tailRaw = decodeSlice(tailData) else {
        return await fullDownload()
    }

    // A tail without any post heading did not reach the posts section.
    let tailHasPosts = tailRaw.contains("\n** ") || tailRaw.hasPrefix("** ")
    if tailStart > 0 && !tailHasPosts {
        return await fullDownload()
    }

    let recentPosts = extractRecentPostBlocks(from: tailRaw, since: since)

    // The tail is known to cover every wanted post only if it starts at the
    // beginning of the file, includes the `* Posts` heading, or contains at
    // least one post older than `since`. Otherwise posts that are still
    // `>= since` may sit before the window and would be lost silently.
    let tailCoversAllPosts = tailStart == 0
        || tailRaw.hasPrefix("* Posts")
        || tailRaw.contains("\n* Posts")
    if !tailCoversAllPosts && !recentPosts.isEmpty {
        let everyDatedPost = extractRecentPostBlocks(from: tailRaw, since: .distantPast)
        if everyDatedPost == recentPosts {
            return await fullDownload()
        }
    }

    return assemble(header: headerSection, posts: recentPosts)
}
|
||
|
||
// MARK: - Range helpers
|
||
|
||
/// Requests the inclusive byte span `start`–`end` of `url` with a `Range` header.
///
/// - Parameters:
///   - url: Resource to read from.
///   - start: First byte offset of the span.
///   - end: Last byte offset of the span.
/// - Returns: The response body for any 2xx status, or `nil` when the request
///   throws, the response is not an HTTP response, or the status is not 2xx.
private func downloadRange(from url: URL, start: Int, end: Int) async -> Data? {
    var rangedRequest = URLRequest(url: url)
    rangedRequest.setValue("bytes=\(start)-\(end)", forHTTPHeaderField: "Range")
    rangedRequest.timeoutInterval = 20

    guard let (body, rawResponse) = try? await session.data(for: rangedRequest) else {
        return nil
    }
    guard let httpResponse = rawResponse as? HTTPURLResponse else {
        return nil
    }
    // 206 lies inside 200..<300, so a single range test covers both cases.
    return (200..<300).contains(httpResponse.statusCode) ? body : nil
}
|
||
|
||
/// Extracts the total length from a `Content-Range` header value.
///
/// The total is the text after the last `/`, e.g. `"bytes 0-0/12345"` → `12345`.
/// Surrounding spaces and tabs are ignored.
///
/// - Parameter header: Raw `Content-Range` header value.
/// - Returns: The total length, or `nil` when there is no `/`, the length is
///   unknown (`*`), or the text is not a plain run of ASCII digits that fits
///   in an `Int`.
private func parseContentRangeTotal(_ header: String) -> Int? {
    guard let slashIdx = header.lastIndex(of: "/") else { return nil }
    let totalStr = header[header.index(after: slashIdx)...]
        .trimmingCharacters(in: .whitespaces)
    // `Int(_:)` alone would also accept "-5" or "+5"; a negative size would
    // make callers believe the whole file fits in an already-fetched range.
    guard !totalStr.isEmpty,
          totalStr.allSatisfy({ $0.isASCII && $0.isNumber }) else {
        return nil
    }
    return Int(totalStr)
}
|
||
|
||
// MARK: - Text helpers
|
||
|
||
/// Returns the org text up to (but not including) the `* Posts` heading.
///
/// The heading is recognised at the very start of `text` or at the start of
/// any later line, and only when `* Posts` is followed by whitespace, a line
/// break, or the end of the text, so a heading such as `* Postscript` is not
/// mistaken for it.
///
/// - Parameter text: Org text that may contain a `* Posts` heading.
/// - Returns: Everything before the heading (`""` when the text starts with
///   it), or `text` unchanged when no such heading exists.
private func trimToHeaderSection(_ text: String) -> String {
    let marker = "* Posts"

    // True when the heading text ends at `index` instead of continuing
    // into a longer word.
    func headingEnds(at index: String.Index) -> Bool {
        guard index < text.endIndex else { return true }
        let next = text[index]
        return next.isNewline || next == " " || next == "\t"
    }

    // Check the start-of-text case first: otherwise a later "\n* Posts"
    // would win and return the posts section itself as "header".
    if text.hasPrefix(marker),
       headingEnds(at: text.index(text.startIndex, offsetBy: marker.count)) {
        return ""
    }

    var searchStart = text.startIndex
    while let found = text.range(of: "\n" + marker, range: searchStart..<text.endIndex) {
        if headingEnds(at: found.upperBound) {
            return String(text[..<found.lowerBound])
        }
        searchStart = found.upperBound
    }
    return text
}
|
||
|
||
/// Cuts `text` into level-2 post blocks and keeps those dated `>= since`.
///
/// A block starts at a post heading line and runs until the next one.
/// Headings that appear inside `#+BEGIN_SRC`, `#+BEGIN_QUOTE` or
/// `#+BEGIN_EXAMPLE` blocks are treated as ordinary content. Anything before
/// the first heading is discarded (it is a partial post from the chunk
/// boundary), and blocks without an extractable date are dropped.
///
/// - Parameters:
///   - text: Org text containing zero or more posts.
///   - since: Inclusive lower bound for the block date.
/// - Returns: The kept blocks, each stripped of surrounding newlines and
///   joined by a blank line; `""` when nothing qualifies.
private func extractRecentPostBlocks(from text: String, since: Date) -> String {
    let fenceOpeners = ["#+BEGIN_SRC", "#+BEGIN_QUOTE", "#+BEGIN_EXAMPLE"]
    let fenceClosers = ["#+END_SRC", "#+END_QUOTE", "#+END_EXAMPLE"]

    var postBlocks: [String] = []
    // `nil` until the first post heading has been seen.
    var pendingLines: [String]?
    var insideFence = false

    // Moves the block collected so far (if any) into `postBlocks`.
    func flushPending() {
        if let finished = pendingLines, !finished.isEmpty {
            postBlocks.append(finished.joined(separator: "\n"))
        }
    }

    for rawLine in text.components(separatedBy: "\n") {
        let keyword = rawLine.trimmingCharacters(in: .whitespaces).uppercased()
        if insideFence {
            if fenceClosers.contains(where: { keyword.hasPrefix($0) }) {
                insideFence = false
            }
        } else if fenceOpeners.contains(where: { keyword.hasPrefix($0) }) {
            insideFence = true
        }

        if !insideFence && isPostHeadingLine(rawLine) {
            flushPending()
            pendingLines = [rawLine]
        } else {
            // No-op while no heading has been seen yet.
            pendingLines?.append(rawLine)
        }
    }
    flushPending()

    var recent: [String] = []
    for candidate in postBlocks {
        guard let postDate = extractDateFromBlock(candidate), postDate >= since else {
            continue
        }
        recent.append(candidate.trimmingCharacters(in: .newlines))
    }
    return recent.joined(separator: "\n\n")
}
|
||
|
||
/// Tells whether `line` is a level-2 org heading.
///
/// A line qualifies when it starts with exactly two asterisks that are
/// followed by a space, a tab, or nothing at all; `*** …` and deeper levels
/// are rejected.
private func isPostHeadingLine(_ line: String) -> Bool {
    guard line.hasPrefix("**") else { return false }
    // A bare "**" has nothing after the stars and still counts as a heading.
    guard let separator = line.dropFirst(2).first else { return true }
    let allowedSeparators: [Character] = [" ", "\t"]
    return allowedSeparators.contains(separator)
}
|
||
|
||
/// Finds the date of a post block.
///
/// The heading line (`** TIMESTAMP …`) is consulted first. If it carries no
/// leading timestamp, every line of the block is scanned for an `:ID:` entry
/// whose value begins with a timestamp, and the first such entry is used.
/// The timestamp text is converted with `PostWriter.parseTimestamp`.
///
/// - Parameter block: Text of one post block, heading line first.
/// - Returns: The converted date, or `nil` when no timestamp is found.
private func extractDateFromBlock(_ block: String) -> Date? {
    let blockLines = block.components(separatedBy: "\n")
    guard let headingLine = blockLines.first else { return nil }

    let headingPrefix = "** "
    if headingLine.hasPrefix(headingPrefix) {
        let title = headingLine
            .dropFirst(headingPrefix.count)
            .trimmingCharacters(in: .whitespaces)
        if let stamp = leadingTimestamp(from: title) {
            return PostWriter.parseTimestamp(stamp)
        }
    }

    let idKey = ":ID:"
    for rawLine in blockLines {
        let trimmed = rawLine.trimmingCharacters(in: .whitespaces)
        guard trimmed.hasPrefix(idKey) else { continue }
        let idValue = trimmed
            .dropFirst(idKey.count)
            .trimmingCharacters(in: .whitespaces)
        // An :ID: without a leading timestamp is skipped; later ones may match.
        guard let stamp = leadingTimestamp(from: idValue) else { continue }
        return PostWriter.parseTimestamp(stamp)
    }

    return nil
}
|
||
|
||
/// Returns the timestamp text that `s` starts with, if any.
///
/// The accepted shape is `YYYY-MM-DDTHH:MM:SS` followed by either `Z` or a
/// numeric offset (`+HH:MM`, `-HH:MM`, `+HHMM`, `-HHMM`). Text after the
/// timestamp is ignored; a timestamp that is not at the very start of `s`
/// does not match.
private func leadingTimestamp(from s: String) -> String? {
    let pattern = #"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:?\d{2}|Z)"#
    return s.range(of: pattern, options: .regularExpression).map { String(s[$0]) }
}
|
||
}
|