Coverage for src / lilbee / crawler / save.py: 100%

105 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-06-28 01:01 +0000

1"""URL-to-filename mapping, metadata I/O, and per-page save-to-disk.""" 

2 

3from __future__ import annotations 

4 

5import hashlib 

6import json 

7import logging 

8import re 

9import tempfile 

10from dataclasses import dataclass 

11from pathlib import Path 

12from urllib.parse import urlparse 

13 

14from lilbee.core.config import cfg 

15from lilbee.core.security import validate_path_within 

16from lilbee.crawler.models import CrawlResult 

17 

18log = logging.getLogger(__name__) 

19 

# Maximum filename length before truncation (most filesystems cap at 255 bytes)
_MAX_FILENAME_LEN = 200

# Sentinel for index pages (trailing slash or empty path)
_INDEX_FILENAME = "index.md"

# How often the crawl metadata JSON is rewritten during a streaming crawl.
# Markdown files are durable per-page; metadata batches to keep write volume
# bounded. Worst-case loss on crash is N-1 entries, recoverable from the files.
METADATA_FLUSH_INTERVAL = 10


def url_to_filename(url: str) -> str:
    """Convert a URL to a safe, relative filesystem path ending in .md.

    The query string and fragment are ignored. A URL whose path is empty,
    ends in a slash, or whose last segment has no dot is treated as a
    directory and mapped to ``index.md`` inside it; otherwise the last
    segment's extension is replaced with ``.md``.

    Examples:
        https://docs.python.org/3/tutorial/    → docs.python.org/3/tutorial/index.md
        https://docs.python.org/3.12/          → docs.python.org/3.12/index.md
        https://example.com/page.html?q=1#frag → example.com/page.md
        https://example.com/page               → example.com/page/index.md
        https://example.com/                   → example.com/index.md

    Args:
        url: The URL of the crawled page.

    Returns:
        A ``host/path`` string of at most ``_MAX_FILENAME_LEN`` characters
        that always ends in ``.md``.
    """
    parsed = urlparse(url)
    host = parsed.hostname or "unknown"

    # Remember the trailing slash before stripping it: "/3.12/" is a directory
    # even though its last segment contains a dot. Without this, "/3.11/" and
    # "/3.12/" would both collapse to "3.md".
    is_directory = parsed.path.endswith("/")
    path = parsed.path.strip("/")

    if not path:
        return f"{host}/{_INDEX_FILENAME}"

    # Neutralize path traversal segments
    path = re.sub(r"\.\.+", "_", path)

    # Replace unsafe filesystem characters
    path = re.sub(r'[<>:"|?*]', "_", path)

    parent, sep, last_segment = path.rpartition("/")
    if is_directory or "." not in last_segment:
        path = f"{path}/{_INDEX_FILENAME}"
    else:
        # Drop everything after the final dot, including the empty extension
        # of a segment like "foo.", so the result always ends in ".md".
        stem = last_segment.rpartition(".")[0]
        path = f"{parent}{sep}{stem}.md"

    full = f"{host}/{path}"

    # Truncate if too long, preserving the .md extension. A short hash of the
    # full URL keeps truncated names distinct from one another.
    if len(full) > _MAX_FILENAME_LEN:
        url_hash = hashlib.sha256(url.encode()).hexdigest()[:12]
        suffix = f"_{url_hash}.md"
        full = full[: _MAX_FILENAME_LEN - len(suffix)] + suffix

    return full

72 

73 

def _web_dir() -> Path:
    """Return the ``_web`` directory located under the configured documents directory."""
    documents_root = cfg.documents_dir
    return documents_root / "_web"

77 

78 

def _crawl_meta_path() -> Path:
    """Return the location of ``crawl_meta.json`` inside the configured data directory."""
    data_root = cfg.data_dir
    return data_root / "crawl_meta.json"

82 

83 

@dataclass
class CrawlMeta:
    """Sidecar record describing what was stored for one crawled URL."""

    # Filename the page was saved under.
    file: str
    # Hash of the saved content, used to detect unchanged pages on re-crawl.
    content_hash: str
    # Timestamp string supplied by the caller when the entry was recorded.
    crawled_at: str

91 

92 

def load_crawl_metadata() -> dict[str, CrawlMeta]:
    """Load URL→metadata mapping from the JSON sidecar.

    Loading is best-effort: a damaged sidecar never raises.

    Returns:
        A dict keyed by URL. It is empty when the sidecar is missing,
        cannot be read, is not valid UTF-8 or JSON, or does not contain a
        JSON object at the top level. Entries whose value cannot be turned
        into a ``CrawlMeta`` are skipped with a warning.
    """
    path = _crawl_meta_path()
    if not path.exists():
        return {}
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which read_text raises when the file is not valid UTF-8.
        return {}
    if not isinstance(raw, dict):
        # Valid JSON that is not an object (list, null, number, ...) has no
        # .items(); treat it like any other corrupt sidecar.
        log.warning("Ignoring crawl metadata that is not a JSON object: %s", path)
        return {}
    result: dict[str, CrawlMeta] = {}
    for url, data in raw.items():
        try:
            result[url] = CrawlMeta(**data)
        except (TypeError, KeyError):
            # Wrong or missing fields, or a value that is not a mapping.
            log.warning("Skipping malformed crawl metadata entry: %s", url)
    return result

109 

110 

def save_crawl_metadata(meta: dict[str, CrawlMeta]) -> None:
    """Write the URL→metadata mapping to the JSON sidecar.

    The JSON is written to a temporary file in the sidecar's own directory
    and then moved over the sidecar, so the sidecar is never left half
    written. If anything fails, the temporary file is removed and the
    exception is re-raised.
    """
    target = _crawl_meta_path()
    target.parent.mkdir(parents=True, exist_ok=True)

    payload = {}
    for url, entry in meta.items():
        payload[url] = {
            "file": entry.file,
            "content_hash": entry.content_hash,
            "crawled_at": entry.crawled_at,
        }

    tmp_name: str | None = None
    try:
        with tempfile.NamedTemporaryFile(
            dir=target.parent, suffix=".tmp", delete=False
        ) as handle:
            tmp_name = handle.name
            encoded = json.dumps(payload, indent=2).encode("utf-8")
            handle.write(encoded)
        # The handle is closed here, so the rename sees the complete file.
        Path(tmp_name).replace(target)
    except BaseException:
        # Clean up on any failure, including interrupts, before propagating.
        if tmp_name is not None:
            Path(tmp_name).unlink(missing_ok=True)
        raise

129 

130 

def content_hash(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* after encoding it to bytes."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    return hasher.hexdigest()

134 

135 

# Matches a Markdown link whose text is itself wrapped in brackets, such as
# the Wikipedia footnote marker
# ``[[1]](https://en.wikipedia.org/wiki/Foo#cite_note-1)``. Standard Markdown
# reads this as a link with the text ``[1]``, but tools that reserve
# ``[[...]]`` for wikilinks (Obsidian) see a broken wikilink followed by a
# literal URL in parentheses.
_REFERENCE_LINK_RE = re.compile(r"\[\[([^\]]*)\]\]\(([^)]*)\)")


def normalize_crawled_markdown(markdown: str) -> str:
    """Rewrite every ``[[N]](url)`` link in *markdown* as ``[N](url)``.

    Only the extra pair of brackets is removed; the link text and target are
    kept as they were. Links with a single pair of brackets and bare
    ``[[...]]`` with no ``(url)`` after it are not matched.
    """

    def _single_bracket(match: re.Match[str]) -> str:
        label, target = match.group(1), match.group(2)
        return f"[{label}]({target})"

    return _REFERENCE_LINK_RE.sub(_single_bracket, markdown)

151 

152 

@dataclass(frozen=True)
class SaveOutcome:
    """Immutable result handed back by ``_save_single_result`` after a write."""

    # Location of the file that was written.
    path: Path
    # Filename derived from the URL, kept so callers need not recompute it.
    filename: str
    # Hash of the content that was written.
    content_hash: str

160 

161 

def _save_single_result(result: CrawlResult, meta: dict[str, CrawlMeta]) -> SaveOutcome | None:
    """Persist one crawl result as a Markdown file when it is new or changed.

    Returns a ``SaveOutcome`` carrying the written path, the filename and
    the content hash. Returns None, without writing, when:

    * the crawl result is unsuccessful or its markdown is blank,
    * the target path fails the containment check against the web directory,
    * *meta* already holds the same hash for this URL and the file exists.
    """
    if not (result.success and result.markdown.strip()):
        return None

    body = normalize_crawled_markdown(result.markdown)
    relative_name = url_to_filename(result.url)
    root = _web_dir()
    destination = root / relative_name
    resolved_root = root.resolve()

    try:
        validate_path_within(destination, resolved_root)
    except ValueError:
        log.warning("Path traversal blocked: %s -> %s", result.url, destination)
        return None

    digest = content_hash(body)
    previous = meta.get(result.url)
    same_content = previous is not None and previous.content_hash == digest
    # Rewrite even when the hash matches if the file is no longer on disk.
    if same_content and destination.exists():
        log.info("Content unchanged, skipping save: %s", result.url)
        return None

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(body, encoding="utf-8")
    return SaveOutcome(path=destination, filename=relative_name, content_hash=digest)

189 

190 

def _update_single_metadata(
    meta: dict[str, CrawlMeta],
    url: str,
    outcome: SaveOutcome,
    now: str,
) -> None:
    """Record *outcome* for *url* in *meta*, replacing any existing entry.

    The filename and hash come from the already-computed outcome; *now* is
    stored as the entry's ``crawled_at`` value. The dict is mutated in place.
    """
    entry = CrawlMeta(
        file=outcome.filename,
        content_hash=outcome.content_hash,
        crawled_at=now,
    )
    meta[url] = entry

202 )