Files
andros b15dac53f1 Parser: stop truncating posts on inline * heading in post bodies
OrgSocialParser.parsePosts() collected lines after `* Posts` until the
next top-level heading, then stopped. That assumption broke the moment
a user wrote an Org heading inside a post body — one such post on
2025-10-14 hid every post that followed it (six months of content)
until the same user tried to see their own freshly-published post and
noticed it was missing.

Fix: read from `* Posts` to end of file. Org Social spec defines no
further top-level sections, and `* foo` inside a body is body content.
Regression test covers this case.

Also make the own profile always take precedence when the TimelineFetcher
merges feeds — the caller's bypassCache copy beats any relay-wide
download that may be stale.
2026-04-21 17:40:01 +02:00

317 lines
13 KiB
Swift

import Foundation
/// Parses the raw text of a `social.org` file into an `OrgSocialProfile`.
///
/// The parser is lenient: missing required fields (TITLE, NICK) produce `nil` values
/// rather than errors. Invalid property values (wrong format) are silently discarded,
/// matching the behaviour of the reference Elisp client.
public struct OrgSocialParser: Sendable {
/// Creates a parser. The initializer takes no arguments and performs no work.
public init() {}
// MARK: - Public API
/// Parses the raw UTF-8 content of a `social.org` file.
///
/// Windows-style `\r\n` line endings are normalised to `\n` before the text is
/// split, so files saved with CRLF parse the same way as LF files.
///
/// Header keywords that are absent simply yield `nil` (or an empty array) in the
/// result; values that fail to convert (for example an `AVATAR` that is not a
/// valid URL) are dropped.
///
/// - Parameter content: Full text of the file.
/// - Returns: A populated `OrgSocialProfile`. `feedURL` is always `nil` here;
///   set it on the result if you know the source URL.
public func parse(_ content: String) -> OrgSocialProfile {
    // The line-level helpers trim with `.whitespaces`, which does not include
    // `\r`. Without this normalisation a CRLF file would leave a trailing
    // carriage return on every line, so `* Posts`, `:PROPERTIES:` and `:END:`
    // would never match and header values would carry a stray `\r`.
    let normalized = content.replacingOccurrences(of: "\r\n", with: "\n")
    let lines = normalized.components(separatedBy: "\n")
    let headers = parseHeaders(lines: lines)
    let posts = parsePosts(lines: lines)

    // Only the first LANGUAGE header is used; it holds space-separated codes.
    // `split` already omits empty pieces, so no extra filtering is needed.
    let languages = headers["LANGUAGE"]?.first
        .map { $0.split(separator: " ").map(String.init) } ?? []

    return OrgSocialProfile(
        title: headers["TITLE"]?.first,
        nick: headers["NICK"]?.first,
        description: headers["DESCRIPTION"]?.first,
        avatar: headers["AVATAR"]?.first.flatMap { URL(string: $0) },
        links: headers["LINK"]?.compactMap { URL(string: $0) } ?? [],
        location: headers["LOCATION"]?.first,
        birthday: headers["BIRTHDAY"]?.first,
        languages: languages,
        feedURL: nil,
        pinned: headers["PINNED"]?.first,
        follows: headers["FOLLOW"]?.compactMap { parseFollow($0) } ?? [],
        groups: headers["GROUP"]?.compactMap { parseGroup($0) } ?? [],
        contacts: headers["CONTACT"] ?? [],
        posts: posts
    )
}
// MARK: - Header parsing
/// Collects the `#+KEYWORD: value` lines found before the first top-level heading.
///
/// Keywords are upper-cased and may only contain letters and underscores; each
/// keyword maps to its values in file order. Entries with an empty value are
/// skipped. Lines inside `#+BEGIN_SRC`, `#+BEGIN_EXAMPLE` and `#+BEGIN_QUOTE`
/// blocks are ignored (block markers are matched case-insensitively).
///
/// - Parameter lines: The file split into lines.
/// - Returns: A dictionary from upper-cased keyword to its non-empty values.
private func parseHeaders(lines: [String]) -> [String: [String]] {
    let blockOpeners = ["#+BEGIN_SRC", "#+BEGIN_EXAMPLE", "#+BEGIN_QUOTE"]
    let blockClosers = ["#+END_SRC", "#+END_EXAMPLE", "#+END_QUOTE"]

    var collected: [String: [String]] = [:]
    var insideBlock = false

    for rawLine in lines {
        let trimmed = rawLine.trimmingCharacters(in: .whitespaces)
        let upper = trimmed.uppercased()

        // Block delimiters toggle the "ignore" state and are never headers themselves.
        if blockOpeners.contains(where: { upper.hasPrefix($0) }) {
            insideBlock = true
            continue
        }
        if blockClosers.contains(where: { upper.hasPrefix($0) }) {
            insideBlock = false
            continue
        }
        if insideBlock {
            continue
        }

        // The header section ends at the first top-level heading.
        if isTopLevelHeading(trimmed) {
            break
        }

        guard trimmed.hasPrefix("#+"), let colon = trimmed.firstIndex(of: ":") else {
            continue
        }

        let nameStart = trimmed.index(trimmed.startIndex, offsetBy: 2)
        let name = String(trimmed[nameStart..<colon]).uppercased()
        let nameIsValid = !name.isEmpty && name.allSatisfy({ $0.isLetter || $0 == "_" })
        guard nameIsValid else {
            continue
        }

        let payload = String(trimmed[trimmed.index(after: colon)...])
            .trimmingCharacters(in: .whitespaces)
        guard !payload.isEmpty else {
            continue
        }
        collected[name, default: []].append(payload)
    }

    return collected
}
// MARK: - Follow / Group
/// Parses the value of a `FOLLOW` header.
///
/// Two shapes are recognised, with fields separated by spaces:
/// - `https://feed.url` — a follow without a name.
/// - `nick https://feed.url` — a named follow.
///
/// - Parameter line: The header value.
/// - Returns: The follow entry, or `nil` when neither shape matches or the URL
///   cannot be constructed.
private func parseFollow(_ line: String) -> OrgSocialFollow? {
    let tokens = line.split(separator: " ").map(String.init)
    guard let first = tokens.first else { return nil }

    // Format: "https://feed.url"
    if isHTTP(first) {
        guard let feed = URL(string: first) else { return nil }
        return OrgSocialFollow(name: nil, url: feed)
    }

    // Format: "nick https://feed.url"
    guard tokens.count >= 2, isHTTP(tokens[1]), let feed = URL(string: tokens[1]) else {
        return nil
    }
    return OrgSocialFollow(name: first, url: feed)
}
/// Parses the value of a `GROUP` header of the form `<name> <relay-url>`.
///
/// The text after the last space is taken as the relay URL; everything before it
/// (trimmed) is the group name, which may itself contain spaces.
///
/// - Parameter line: The header value.
/// - Returns: The group, or `nil` when there is no space, the name is empty, or
///   the last field is not an `http://` / `https://` URL.
private func parseGroup(_ line: String) -> OrgSocialGroup? {
    let trimmed = line.trimmingCharacters(in: .whitespaces)
    guard let separator = trimmed.lastIndex(of: " ") else { return nil }

    let groupName = String(trimmed[..<separator]).trimmingCharacters(in: .whitespaces)
    let relayText = String(trimmed[trimmed.index(after: separator)...])

    guard !groupName.isEmpty, isHTTP(relayText), let relay = URL(string: relayText) else {
        return nil
    }
    return OrgSocialGroup(name: groupName, relayURL: relay)
}
// MARK: - Post parsing
/// Splits the posts section into per-post blocks and parses each one.
///
/// The section runs from the line after the first `* Posts` heading to the end
/// of the file. It deliberately does NOT stop at the next top-level heading:
/// users may write `* heading` lines inside post bodies, and treating those as
/// section boundaries silently dropped every post that followed.
///
/// - Parameter lines: The file split into lines.
/// - Returns: The posts that parsed successfully, in file order; empty when the
///   file has no `* Posts` heading.
private func parsePosts(lines: [String]) -> [OrgSocialPost] {
    guard let markerIndex = lines.firstIndex(where: {
        $0.trimmingCharacters(in: .whitespaces) == "* Posts"
    }) else {
        return []
    }

    var blocks: [[String]] = []
    var pending: [String] = []

    for line in lines.dropFirst(markerIndex + 1) {
        guard isPostHeading(line) else {
            // Not a heading: the line belongs to whatever block is being built.
            pending.append(line)
            continue
        }
        // A new post heading closes the block collected so far.
        if !pending.isEmpty {
            blocks.append(pending)
        }
        pending = [line]
    }
    if !pending.isEmpty {
        blocks.append(pending)
    }

    return blocks.compactMap { parsePostBlock($0) }
}
/// Parses a single post block (array of lines starting with the `** ` heading).
///
/// The post ID is taken from the heading when it starts with a timestamp, and
/// otherwise from the `:ID:` property. Only the FIRST `:PROPERTIES:` … `:END:`
/// drawer in the block is read; scanning stops at its `:END:`. Any later
/// `:PROPERTIES:` or `:END:` line is body content (for example another drawer or
/// a quoted example) and must not move the start of the body or overwrite
/// properties — doing so silently dropped the text that preceded it.
///
/// - Parameter lines: The lines of one block; `lines[0]` is expected to be the heading.
/// - Returns: The post, or `nil` when the block is empty, has no valid ID, the
///   ID cannot be parsed as a date, or the date lies in the future.
private func parsePostBlock(_ lines: [String]) -> OrgSocialPost? {
    guard let heading = lines.first else { return nil }

    // --- ID from header line ---
    let headerRest = heading.hasPrefix("** ")
        ? String(heading.dropFirst(3)).trimmingCharacters(in: .whitespaces)
        : ""
    let idFromHeader = extractLeadingTimestamp(from: headerRest)

    // --- Properties drawer ---
    var properties: [String: String] = [:]
    var idFromProperties: String? = nil
    var contentStart = 1 // default if no complete :PROPERTIES: drawer is found
    var inProps = false

    for (index, line) in lines.enumerated().dropFirst() {
        let t = line.trimmingCharacters(in: .whitespaces)

        if !inProps {
            // A stray `:END:` before any drawer was opened is ignored here; it is
            // filtered out of the text later because it starts with ":".
            if t == ":PROPERTIES:" { inProps = true }
            continue
        }

        if t == ":END:" {
            // The body starts on the line after the drawer's `:END:`.
            contentStart = index + 1
            // Stop: everything after the first drawer is body content.
            break
        }

        if let (key, val) = parsePropLine(t) {
            if key == "ID" {
                idFromProperties = isValidTimestamp(val) ? val : nil
            } else {
                properties[key] = val
            }
        }
    }

    // Header ID takes priority over property ID (spec v1.6)
    guard let timestamp = idFromHeader ?? idFromProperties else { return nil }
    guard let date = parseDate(timestamp) else { return nil }

    // Skip scheduled posts (future timestamps)
    guard date <= Date() else { return nil }

    // --- Text content ---
    let rawLines = contentStart < lines.count ? Array(lines[contentStart...]) : []
    let text = extractText(from: rawLines)

    // --- Validated properties ---
    return OrgSocialPost(
        timestamp: timestamp,
        date: date,
        text: text,
        lang: validateLang(properties["LANG"]),
        tags: parseTags(properties["TAGS"]),
        client: validateShortText(properties["CLIENT"]),
        replyTo: validateURLTimestamp(properties["REPLY_TO"]),
        include: validateURLTimestamp(properties["INCLUDE"]),
        pollEnd: properties["POLL_END"].flatMap { parseDate($0) },
        pollOption: validateShortText(properties["POLL_OPTION"]),
        group: validateGroupProp(properties["GROUP"]),
        mood: validateShortText(properties["MOOD"]),
        migration: properties["MIGRATION"],
        visibility: validateVisibility(properties["VISIBILITY"])
    )
}
/// Splits one `:KEY: value` drawer line into its key and value.
///
/// The key is the text between the first two colons, upper-cased; the value is
/// the rest of the line with surrounding whitespace removed.
///
/// - Parameter trimmed: The line, already stripped of surrounding whitespace.
/// - Returns: The pair, or `nil` for the `:PROPERTIES:` / `:END:` markers, for
///   lines that do not start with a colon or lack a second colon, and for lines
///   whose key or value is empty.
private func parsePropLine(_ trimmed: String) -> (key: String, value: String)? {
    if !trimmed.hasPrefix(":") || trimmed == ":PROPERTIES:" || trimmed == ":END:" {
        return nil
    }

    let afterColon = String(trimmed.dropFirst(1))
    guard let closing = afterColon.firstIndex(of: ":") else { return nil }

    let name = String(afterColon[..<closing]).uppercased()
    let payload = String(afterColon[afterColon.index(after: closing)...])
        .trimmingCharacters(in: .whitespaces)

    if name.isEmpty || payload.isEmpty {
        return nil
    }
    return (key: name, value: payload)
}
/// Builds the post text from the body lines.
///
/// Lines whose first non-whitespace character is `#` (Org comments/keywords) or
/// `:` (drawer/property lines) are removed; the existing comment states this
/// mirrors the reference Elisp implementation. The remaining lines are joined
/// with `\n` and the result is stripped of leading and trailing whitespace and
/// newlines.
///
/// - Parameter lines: The body lines of a post.
/// - Returns: The cleaned-up text (possibly empty).
private func extractText(from lines: [String]) -> String {
    var kept: [String] = []
    for line in lines {
        let trimmed = line.trimmingCharacters(in: .whitespaces)
        if trimmed.hasPrefix("#") || trimmed.hasPrefix(":") {
            continue
        }
        kept.append(line)
    }
    let joined = kept.joined(separator: "\n")
    return joined.trimmingCharacters(in: .whitespacesAndNewlines)
}
// MARK: - Property validation
/// Validates a `LANG` property value.
///
/// Accepts 2–5 lowercase ASCII letters, optionally followed by `-` and 2–3 more
/// lowercase letters (e.g. `en`, `pt-br`).
///
/// - Parameter s: The raw property value, if any.
/// - Returns: `s` unchanged when it matches, otherwise `nil`.
private func validateLang(_ s: String?) -> String? {
    guard let candidate = s,
          candidate.range(of: #"^[a-z]{2,5}(-[a-z]{2,3})?$"#, options: .regularExpression) != nil
    else {
        return nil
    }
    return candidate
}
/// Parses a `TAGS` property value into individual tags.
///
/// A valid value is one or more tags made of ASCII letters, digits, `_` or `-`,
/// separated by whitespace. The split uses the same notion of "whitespace" as
/// the validation regex (`\s+`), so tab-separated tags are split correctly
/// rather than being returned as one tag with an embedded tab.
///
/// - Parameter s: The raw property value, if any.
/// - Returns: The tags in order, or an empty array when the value is missing or invalid.
private func parseTags(_ s: String?) -> [String] {
    guard let s,
          s.range(of: #"^[a-zA-Z0-9_-]+(\s+[a-zA-Z0-9_-]+)*$"#, options: .regularExpression) != nil
    else {
        return []
    }
    return s.split(whereSeparator: \.isWhitespace).map(String.init)
}
/// Validates a short single-line property value.
///
/// - Parameter s: The raw property value, if any.
/// - Returns: `s` unchanged when it is shorter than 200 characters and contains
///   no `\n` or `\r`; otherwise `nil`.
private func validateShortText(_ s: String?) -> String? {
    guard let text = s else { return nil }
    if text.count >= 200 {
        return nil
    }
    if text.contains("\n") || text.contains("\r") {
        return nil
    }
    return text
}
/// Validates a `<feed-url>#<timestamp>` reference (used for `REPLY_TO` and `INCLUDE`).
///
/// The value must start with `http://` or `https://` and contain `#` followed by
/// `YYYY-MM-DDTHH:MM:SS` and then either a signed two-digit hour offset or `Z`.
/// `Z` is accepted so that references to posts whose IDs use the UTC designator
/// (which the ID validators in this parser allow) are not discarded. The pattern
/// is anchored at the start only, so whatever follows the hour offset (such as
/// the minutes part) is not checked.
///
/// - Parameter s: The raw property value, if any.
/// - Returns: `s` unchanged when it matches, otherwise `nil`.
private func validateURLTimestamp(_ s: String?) -> String? {
    guard let s else { return nil }
    let pattern = #"^https?://.+#\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}|Z)"#
    let matches = s.range(of: pattern, options: .regularExpression) != nil
    return matches ? s : nil
}
/// Validates a post-level `GROUP` property value.
///
/// The value must consist of at least two space-separated fields, the last of
/// which is an `http://` / `https://` URL.
///
/// - Parameter s: The raw property value, if any.
/// - Returns: `s` unchanged when it has that shape, otherwise `nil`.
private func validateGroupProp(_ s: String?) -> String? {
    guard let value = s else { return nil }
    let fields = value.split(separator: " ")
    guard fields.count >= 2, let relay = fields.last, isHTTP(String(relay)) else {
        return nil
    }
    return value
}
/// Validates a `VISIBILITY` property value.
///
/// - Parameter s: The raw property value, if any.
/// - Returns: `s` when it is exactly `public` or `mention`, otherwise `nil`.
private func validateVisibility(_ s: String?) -> String? {
    guard let value = s, ["public", "mention"].contains(value) else {
        return nil
    }
    return value
}
// MARK: - Date / timestamp utilities
/// Converts a post timestamp string into a `Date`.
///
/// This is a thin wrapper around `PostWriter.parseTimestamp` so that writing and
/// parsing share one implementation; according to the original note here, that
/// helper handles compact `+0200`, colon `+02:00`, and `Z` offsets.
///
/// - Parameter timestamp: The timestamp text.
/// - Returns: Whatever `PostWriter.parseTimestamp` returns for the input.
private func parseDate(_ timestamp: String) -> Date? {
    return PostWriter.parseTimestamp(timestamp)
}
/// Checks whether the whole string is an RFC 3339-style post timestamp.
///
/// Accepted shape: `YYYY-MM-DDTHH:MM:SS` followed by `Z` or a signed offset
/// written as `+HHMM` or `+HH:MM`. Only the digit layout is checked, not the
/// calendar validity of the values.
///
/// - Parameter s: The candidate string.
/// - Returns: `true` when `s` matches the pattern.
private func isValidTimestamp(_ s: String) -> Bool {
    let pattern = #"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:?\d{2}|Z)$"#
    let match = s.range(of: pattern, options: .regularExpression)
    return match != nil
}
/// Returns the RFC 3339-style timestamp at the very start of a string, if any.
///
/// Used when the post ID is embedded in the `** ` heading. The accepted shape is
/// `YYYY-MM-DDTHH:MM:SS` followed by `Z` or a signed `+HHMM` / `+HH:MM` offset;
/// any text after the timestamp is ignored.
///
/// - Parameter s: The heading text after the `** ` prefix.
/// - Returns: The matched timestamp, or `nil` when `s` does not start with one.
private func extractLeadingTimestamp(from s: String) -> String? {
    let pattern = #"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:?\d{2}|Z)"#
    return s.range(of: pattern, options: .regularExpression).map { String(s[$0]) }
}
// MARK: - Line classification helpers
/// Tells whether a line is a level-1 Org heading: it starts with `* ` and not
/// with `** `.
///
/// - Parameter line: The line to classify (callers pass it already trimmed).
/// - Returns: `true` for a top-level heading.
private func isTopLevelHeading(_ line: String) -> Bool {
    line.hasPrefix("* ") && !line.hasPrefix("** ")
}
/// Returns `true` for a level-2 heading: exactly two stars at the start of the
/// line, followed either by nothing or by whitespace.
///
/// Requiring whitespace after the stars keeps body lines such as
/// `**bold** text` from being mistaken for a new post heading, which would
/// cut the current post short and discard the rest of its body. A bare `**`
/// (no title at all) is still accepted, and `*** ...` is still rejected.
///
/// - Parameter line: The raw (untrimmed) line.
/// - Returns: `true` when the line starts a new post block.
private func isPostHeading(_ line: String) -> Bool {
    guard line.hasPrefix("**") else { return false }
    // Nothing after the two stars: an empty level-2 heading.
    guard let next = line.dropFirst(2).first else { return true }
    // A third star means a deeper heading; any other non-whitespace character
    // means this is ordinary text that merely begins with "**".
    return next.isWhitespace
}
/// Tells whether a string starts with the `http://` or `https://` scheme
/// (case-sensitive prefix check; the rest of the string is not inspected).
///
/// - Parameter s: The candidate URL text.
/// - Returns: `true` when `s` begins with one of the two schemes.
private func isHTTP(_ s: String) -> Bool {
    let schemes = ["http://", "https://"]
    return schemes.contains { s.hasPrefix($0) }
}
}