// org-social-ios/Sources/OrgSocialKit/Network/PartialFeedFetcher.swift

import Foundation

/// Fetches only the portions of a `social.org` file that are needed for a
/// given use case, using HTTP Range requests when the server supports them.
///
/// **`fetchProfileHeader(from:)`** — Discover use case.
/// Downloads only the file header (everything before `* Posts`) to extract
/// nick, avatar, and description without downloading the posts section.
/// For a feed with 500 posts the saving is typically 90–99 % of bandwidth.
///
/// **`fetchSince(from:since:)`** — Timeline use case.
/// Downloads the header section plus posts dated at or after `since`,
/// skipping older posts. When the server ignores Range, the full body of the
/// first response is filtered directly; when a partial request fails, it
/// falls back to a full download.
///
/// Both methods accept any HTTP/HTTPS URL and return `nil` on error.
public struct PartialFeedFetcher: Sendable {

    /// Session through which every request of this fetcher is issued.
    private let session: URLSession
    /// Parser applied to the downloaded header text.
    private let parser: OrgSocialParser

    /// Size in bytes of the leading range requested to capture the file header (16 KiB).
    private let headerProbeSize: Int = 16_384
    /// Upper bound in bytes on the trailing range requested for recent posts (128 KiB).
    private let postsTailSize: Int = 131_072

    /// Creates a fetcher that sends its requests through `session`.
    ///
    /// - Parameter session: The URL session to use; defaults to `URLSession.shared`.
    public init(session: URLSession = .shared) {
        self.parser = OrgSocialParser()
        self.session = session
    }

    // MARK: - Feature 2: Header-only fetch

    /// Downloads only the leading part of the file and returns the profile
    /// parsed from its header section (everything before `* Posts`).
    ///
    /// Sends `Range: bytes=0-16383`. Any 2xx response is accepted: on 206 the
    /// body is the requested prefix, on 200 the server ignored `Range` and the
    /// body is the whole file. In both cases only the text before `* Posts`
    /// is handed to the parser, so no post text is parsed.
    ///
    /// A byte range can end in the middle of a multi-byte UTF-8 sequence.
    /// For a 206 body, up to three trailing bytes are therefore discarded if
    /// that is what it takes to decode the text; a full (non-206) body must
    /// be valid UTF-8 as is.
    ///
    /// - Parameter url: Feed location; only `http` and `https` are accepted.
    /// - Returns: The parsed profile with `feedURL` set to `url`, or `nil` on
    ///   an unsupported scheme, a transport error, a non-2xx status, or a
    ///   body that cannot be decoded as UTF-8.
    public func fetchProfileHeader(from url: URL) async -> OrgSocialProfile? {
        guard url.scheme == "http" || url.scheme == "https" else { return nil }

        var request = URLRequest(url: url)
        request.timeoutInterval = 10
        request.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")

        // 206 is already inside 200..<300, so one range check covers both
        // the partial and the "Range ignored" answers.
        guard let (data, response) = try? await session.data(for: request),
              let http = response as? HTTPURLResponse,
              (200..<300).contains(http.statusCode) else {
            return nil
        }

        // Only a partial body can have been cut inside a character. A UTF-8
        // sequence is at most four bytes long, so dropping at most three
        // trailing bytes is enough to get back onto a character boundary.
        let maxTrim = http.statusCode == 206 ? min(3, data.count) : 0
        var decoded: String?
        for trim in 0...maxTrim {
            if let candidate = String(data: data.dropLast(trim), encoding: .utf8) {
                decoded = candidate
                break
            }
        }
        guard let text = decoded else { return nil }

        var profile = parser.parse(trimToHeaderSection(text))
        profile.feedURL = url
        return profile
    }

    // MARK: - Feature 3: Date-filtered fetch for Timeline

    /// Downloads the file header plus the posts whose date is `>= since`.
    ///
    /// Strategy:
    /// 1. Request the first `headerProbeSize` bytes (`Range: bytes=0-N`).
    ///    A 206 response carries `Content-Range: bytes 0-N/TOTAL`, which
    ///    yields the total file size without a separate probe request.
    /// 2. 200: the server ignored `Range` and the body is the whole file.
    ///    Posts are filtered straight from it; no second request is made.
    /// 3. 206: request the last `postsTailSize` bytes and rebuild an org
    ///    document from the header plus the recent posts found in that tail
    ///    (128 KB is roughly 650 posts at 200 bytes each).
    ///
    /// The tail strategy assumes posts are stored oldest-first, so that the
    /// newest ones sit at the end of the file. The partial result is used
    /// only when it is known to be complete: either the tail reaches the
    /// `* Posts` heading, or it contains at least one post older than
    /// `since`. In every other case — and whenever a request, a decode, or
    /// the `Content-Range` parse fails, or the header does not fit inside
    /// the probe — the method falls back to `FeedFetcher.fetch(from:)` and
    /// returns its result without date filtering.
    ///
    /// - Parameters:
    ///   - url: Feed location; only `http` and `https` are accepted.
    ///   - since: Lower bound (inclusive) for the dates of the returned posts.
    /// - Returns: An org document made of the header section, a `* Posts`
    ///   heading and the matching posts; or the fallback's result; or `nil`
    ///   for an unsupported scheme or when the fallback itself fails.
    public func fetchSince(from url: URL, since: Date) async -> String? {
        guard url.scheme == "http" || url.scheme == "https" else { return nil }

        // The header request doubles as the file-size probe: a 206 answer
        // includes Content-Range with the total size.
        var headerRequest = URLRequest(url: url)
        headerRequest.timeoutInterval = 10
        headerRequest.setValue("bytes=0-\(headerProbeSize - 1)", forHTTPHeaderField: "Range")

        guard let (headerData, headerResponse) = try? await session.data(for: headerRequest),
              let http = headerResponse as? HTTPURLResponse,
              (200..<300).contains(http.statusCode) else {
            return await fullFeed(from: url)
        }

        // A partial body may stop inside a multi-byte character; a complete
        // body must decode strictly.
        let headerText = http.statusCode == 206
            ? decodeRangeBody(headerData)
            : String(data: headerData, encoding: .utf8)
        guard let headerRaw = headerText else {
            return await fullFeed(from: url)
        }

        let headerSection = trimToHeaderSection(headerRaw)

        // 200: the body is the complete file, so filter it directly.
        if http.statusCode == 200 {
            return composeDocument(
                header: headerSection,
                posts: extractRecentPostBlocks(from: headerRaw, since: since)
            )
        }

        // 206: the total size comes from Content-Range.
        guard let crHeader = http.value(forHTTPHeaderField: "Content-Range"),
              let fileSize = parseContentRangeTotal(crHeader) else {
            return await fullFeed(from: url)
        }

        // The whole file fits in the probe: nothing more to download.
        if fileSize <= headerProbeSize {
            return composeDocument(
                header: headerSection,
                posts: extractRecentPostBlocks(from: headerRaw, since: since)
            )
        }

        // The probe is a strict prefix of a larger file. Without `* Posts`
        // inside it the header was cut short, and stitching it to a tail
        // would produce a corrupt document.
        guard headerRaw.hasPrefix("* Posts") || headerRaw.contains("\n* Posts") else {
            return await fullFeed(from: url)
        }

        // Fetch the final `postsTailSize` bytes. The range may start inside
        // a multi-byte character, hence the tolerant decode.
        let tailStart = max(0, fileSize - postsTailSize)
        guard let tailData = await downloadRange(from: url, start: tailStart, end: fileSize - 1),
              let tailRaw = decodeRangeBody(tailData) else {
            return await fullFeed(from: url)
        }

        // No post heading at all in a truncated tail: the window did not
        // reach the posts in a usable way.
        let tailHasPosts = tailRaw.contains("\n** ") || tailRaw.hasPrefix("** ")
        if tailStart > 0 && !tailHasPosts {
            return await fullFeed(from: url)
        }

        let recentPosts = extractRecentPostBlocks(from: tailRaw, since: since)

        // Completeness check. If the tail does not include the `* Posts`
        // heading, posts exist before the window. They can only be ignored
        // when the window already contains a post older than `since`; if
        // every dated post in the tail is recent, earlier recent posts may
        // have been cut off.
        let tailReachesPostsHeading = tailStart == 0
            || tailRaw.hasPrefix("* Posts")
            || tailRaw.contains("\n* Posts")
        if !tailReachesPostsHeading {
            let everyDatedPost = extractRecentPostBlocks(from: tailRaw, since: .distantPast)
            if recentPosts == everyDatedPost {
                return await fullFeed(from: url)
            }
        }

        return composeDocument(header: headerSection, posts: recentPosts)
    }

    /// Full-download fallback: returns what `FeedFetcher.fetch(from:)`
    /// produces for `url`, or `nil` when it throws.
    private func fullFeed(from url: URL) async -> String? {
        try? await FeedFetcher(session: session).fetch(from: url)
    }

    /// Joins a header section and already-filtered post blocks under a
    /// `* Posts` heading.
    private func composeDocument(header: String, posts: String) -> String {
        let postsBlock = posts.isEmpty
            ? "* Posts\n"
            : "* Posts\n\n" + posts + "\n"
        return header + "\n" + postsBlock
    }

    /// Decodes the body of a byte-range response as UTF-8, tolerating a
    /// multi-byte sequence cut at either edge of the range.
    private func decodeRangeBody(_ data: Data) -> String? {
        if let whole = String(data: data, encoding: .utf8) { return whole }

        // A range can begin on continuation bytes (10xxxxxx) whose lead byte
        // lies before the range; there are at most three of them.
        var body = data
        var skipped = 0
        while skipped < 3, let lead = body.first, (lead & 0xC0) == 0x80 {
            body = body.dropFirst()
            skipped += 1
        }

        // A range can also end before a sequence is complete; at most three
        // trailing bytes belong to such a fragment.
        for trim in 0...min(3, body.count) {
            if let text = String(data: body.dropLast(trim), encoding: .utf8) {
                return text
            }
        }
        return nil
    }

    // MARK: - Range helpers

    /// Requests `Range: bytes=start-end` for `url` with a 20-second timeout
    /// and returns the response body.
    ///
    /// Any 2xx status is accepted (206 included), so the status alone does
    /// not tell the caller whether the server honoured the range.
    ///
    /// - Returns: The body, or `nil` when the transfer throws, the response
    ///   is not an HTTP response, or the status is outside `200..<300`.
    private func downloadRange(from url: URL, start: Int, end: Int) async -> Data? {
        var rangeRequest = URLRequest(url: url)
        rangeRequest.setValue("bytes=\(start)-\(end)", forHTTPHeaderField: "Range")
        rangeRequest.timeoutInterval = 20

        guard let (body, response) = try? await session.data(for: rangeRequest) else {
            return nil
        }
        guard let status = (response as? HTTPURLResponse)?.statusCode else {
            return nil
        }
        // 206 lies within 200..<300, so a single range test is sufficient.
        return (200..<300).contains(status) ? body : nil
    }

    /// Extracts the total resource length from a `Content-Range` value.
    ///
    /// Example: `"bytes 0-16383/204800"` yields `204800`. The length is the
    /// text after the last `/`, with surrounding whitespace ignored. It must
    /// consist solely of ASCII digits, so an unknown length (`*`), a signed
    /// value such as `-5` or `+5`, and an empty value are all rejected.
    ///
    /// - Parameter header: The raw header value.
    /// - Returns: The total length, or `nil` when there is no `/`, the length
    ///   is not a plain digit string, or it does not fit in `Int`.
    private func parseContentRangeTotal(_ header: String) -> Int? {
        guard let slashIdx = header.lastIndex(of: "/") else { return nil }
        let totalStr = header[header.index(after: slashIdx)...]
            .trimmingCharacters(in: .whitespaces)
        // `Int(_:)` alone would accept a leading sign; a length never has one.
        guard !totalStr.isEmpty,
              totalStr.utf8.allSatisfy({ (0x30...0x39).contains($0) }) else {
            return nil
        }
        return Int(totalStr)
    }

    // MARK: - Text helpers

    /// Returns the org text that precedes the `* Posts` heading.
    ///
    /// The heading is recognised at the very start of the text or at the
    /// start of any later line, and only when `* Posts` is followed by
    /// whitespace, a line break, or the end of the text. A heading that
    /// merely begins with the same letters (for example `* Postscript`) is
    /// therefore not mistaken for it.
    ///
    /// - Parameter text: Org text, possibly a truncated prefix of a file.
    /// - Returns: Everything before the heading (without the newline that
    ///   precedes it); `""` when the text starts with the heading; `text`
    ///   unchanged when no heading is found.
    private func trimToHeaderSection(_ text: String) -> String {
        let marker = "* Posts"

        // The heading word must stop at `index`: end of text or whitespace
        // (which for `Character` includes line breaks).
        func endsHeadingWord(at index: String.Index) -> Bool {
            index == text.endIndex || text[index].isWhitespace
        }

        // Check the start of the text first, so that a document beginning
        // with the heading never returns that heading as "header".
        if text.hasPrefix(marker),
           endsHeadingWord(at: text.index(text.startIndex, offsetBy: marker.count)) {
            return ""
        }

        // Walk through every line-start occurrence until one is a real match.
        var searchRange = text.startIndex..<text.endIndex
        while let hit = text.range(of: "\n" + marker, range: searchRange) {
            if endsHeadingWord(at: hit.upperBound) {
                return String(text[..<hit.lowerBound])
            }
            searchRange = hit.upperBound..<text.endIndex
        }
        return text
    }

    /// Collects the level-2 (`** …`) post blocks found in `text` and keeps
    /// those dated at or after `since`.
    ///
    /// Anything ahead of the first post heading is discarded; in a byte-range
    /// chunk that is the remainder of a post whose heading was cut off.
    /// Heading-like lines inside `#+BEGIN_SRC`, `#+BEGIN_QUOTE` or
    /// `#+BEGIN_EXAMPLE` blocks do not start a new post. Blocks for which
    /// `extractDateFromBlock` finds no date are dropped.
    ///
    /// - Returns: The surviving blocks, each stripped of leading and trailing
    ///   newlines and separated by one blank line; `""` when none survive.
    private func extractRecentPostBlocks(from text: String, since: Date) -> String {
        let fenceOpeners = ["#+BEGIN_SRC", "#+BEGIN_QUOTE", "#+BEGIN_EXAMPLE"]
        let fenceClosers = ["#+END_SRC", "#+END_QUOTE", "#+END_EXAMPLE"]

        var finishedPosts: [String] = []
        // `nil` until the first post heading has been seen.
        var openPost: [String]? = nil
        var insideFence = false

        for rawLine in text.components(separatedBy: "\n") {
            // Fence markers are matched case-insensitively, ignoring indentation.
            let marker = rawLine.trimmingCharacters(in: .whitespaces).uppercased()
            if insideFence {
                if fenceClosers.contains(where: { marker.hasPrefix($0) }) {
                    insideFence = false
                }
            } else if fenceOpeners.contains(where: { marker.hasPrefix($0) }) {
                insideFence = true
            }

            if !insideFence && isPostHeadingLine(rawLine) {
                // A new heading closes the post collected so far.
                if let done = openPost {
                    finishedPosts.append(done.joined(separator: "\n"))
                }
                openPost = [rawLine]
            } else {
                openPost?.append(rawLine)
            }
        }
        if let done = openPost {
            finishedPosts.append(done.joined(separator: "\n"))
        }

        var kept: [String] = []
        for post in finishedPosts {
            guard let stamp = extractDateFromBlock(post), stamp >= since else { continue }
            kept.append(post.trimmingCharacters(in: .newlines))
        }
        return kept.joined(separator: "\n\n")
    }

    /// Tells whether `line` opens a level-2 org heading.
    ///
    /// The line must start with exactly two asterisks followed by nothing, a
    /// space, or a tab. Deeper headings (`*** …`) and text such as
    /// `**bold**` are rejected.
    private func isPostHeadingLine(_ line: String) -> Bool {
        guard line.hasPrefix("**") else { return false }
        // Nothing after the two asterisks: a bare `**` line counts as a heading.
        guard let following = line.dropFirst(2).first else { return true }
        return [" ", "\t"].contains(following)
    }

    /// Finds the date of a post block.
    ///
    /// Looks first for a timestamp directly after the `** ` marker on the
    /// block's first line. Failing that, it uses the first `:ID:` line whose
    /// value starts with a timestamp. The matched text is handed to
    /// `PostWriter.parseTimestamp`, whose result is returned as is.
    ///
    /// - Returns: The parsed date, or `nil` when neither place holds a
    ///   leading timestamp.
    private func extractDateFromBlock(_ block: String) -> Date? {
        let blockLines = block.components(separatedBy: "\n")
        guard let heading = blockLines.first else { return nil }

        // Heading form: `** <timestamp> …`
        if heading.hasPrefix("** "),
           let stamp = leadingTimestamp(
               from: heading.dropFirst(3).trimmingCharacters(in: .whitespaces)
           ) {
            return PostWriter.parseTimestamp(stamp)
        }

        // Property form: `:ID: <timestamp>`
        let idMarker = ":ID:"
        let trimmedLines = blockLines.map { $0.trimmingCharacters(in: .whitespaces) }
        for candidate in trimmedLines where candidate.hasPrefix(idMarker) {
            let idValue = candidate.dropFirst(idMarker.count)
                .trimmingCharacters(in: .whitespaces)
            guard let stamp = leadingTimestamp(from: idValue) else { continue }
            return PostWriter.parseTimestamp(stamp)
        }

        return nil
    }

    /// Returns the timestamp found at the very start of `s`, if any.
    ///
    /// The accepted shape is `YYYY-MM-DDTHH:MM:SS` followed by either `Z` or
    /// a numeric offset (`+HH:MM` or `+HHMM`, with either sign). Text after
    /// the timestamp is ignored; a timestamp without a zone does not match.
    private func leadingTimestamp(from s: String) -> String? {
        let pattern = #"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:?\d{2}|Z)"#
        return s.range(of: pattern, options: .regularExpression).map { String(s[$0]) }
    }
}