-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
249 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
#!/usr/bin/env ruby | ||
|
||
require "bundler/inline" | ||
|
||
# Declare the script's gem dependencies inline via bundler/inline.
# The first attempt calls `gemfile` with `false` as its first argument; if that
# raises a StandardError, exactly one more attempt is made with `true`.
# An error raised by the second attempt is propagated to the caller.
second_attempt = false

begin
  gemfile(second_attempt, quiet: true) do
    source "https://rubygems.org"

    gem "redcarpet", "~> 3.6"
    gem "nokogiri"
    gem "yaml"
    gem "ruby-progressbar"
    gem "uri"
    gem "debug"
  end
rescue StandardError
  # Already retried once: give up and surface the error.
  raise if second_attempt

  second_attempt = true
  retry
end
|
||
# A single documentation page together with the attributes needed to build
# the chunk payload pushed to the search index.
class Document
  attr_reader :id, :path, :link, :tags, :groups, :weight

  # @param id [String] tracking id of the page
  # @param path [String] path to the source file (.md or .html)
  # @param link [String] public URL of the page
  # @param tags [Array, nil] emitted as `tag_set` when present
  # @param groups [Array, nil] emitted as `group_tracking_ids` when present
  # @param weight [Numeric] chunk weight
  def initialize(id, path, link, tags: nil, groups: nil, weight: 1.0)
    @id = id
    @path = path
    @link = link
    @tags = tags
    @groups = groups
    @weight = weight
  end

  # Returns the page contents as HTML. Markdown files are rendered with
  # Redcarpet; HTML files are returned verbatim.
  #
  # @return [String]
  # @raise [ArgumentError] when the file extension is neither .md nor .html
  def to_html
    # Compare the extension case-insensitively so "page.MD" / "page.HTML" work too.
    case File.extname(path).downcase
    when ".md"
      renderer = Redcarpet::Markdown.new(Redcarpet::Render::HTML, autolink: true, tables: true)
      renderer.render(File.read(path))
    when ".html"
      File.read(path)
    else
      raise ArgumentError, "Unsupported file type: #{path}"
    end
  end

  # Builds the chunk payload hash. `tag_set` and `group_tracking_ids` are only
  # included when tags / groups were provided.
  #
  # @return [Hash]
  def to_chunk_json
    payload = {chunk_html: to_html, link: link, tracking_id: id, weight: weight}
    payload[:tag_set] = tags if tags
    payload[:group_tracking_ids] = groups if groups
    payload
  end
end
|
||
# Reads `.trieve.yml` from a docs directory and turns its `pages` entries
# into Document objects.
class ConfigParser
  # Shortcut for `new(path).documents`.
  def self.parse(path) = new(path).documents

  attr_reader :config_path, :root_dir

  # @param root_dir [String] directory that contains `.trieve.yml`
  # @raise [ArgumentError] when `.trieve.yml` does not exist in root_dir
  def initialize(root_dir)
    @root_dir = root_dir
    @config_path = File.join(root_dir, ".trieve.yml")
    raise ArgumentError, ".trieve.yml is missing in the #{root_dir}" unless File.file?(config_path)
  end

  # @return [Array] the `groups` entries from the config, or an empty array
  def groups
    config["groups"] || []
  end

  # Expands every `pages` entry (glob patterns allowed) into Document objects,
  # skipping sources that match one of the `ignore` patterns.
  #
  # @return [Array<Document>]
  # @raise [KeyError] when a page has no explicit `link` and `hostname` is not configured
  def documents
    defaults = (config["defaults"] || {}).transform_keys(&:to_sym)
    ignored = Array(config["ignore"])
    # Sources are absolute (see #unwrap_pages), so the prefix to strip must be
    # the absolute root as well; a relative or trailing-slash root_dir would
    # otherwise leak filesystem paths into links and ids.
    absolute_root = File.expand_path(root_dir)

    unwrap_pages(config["pages"] || []).filter_map do |page|
      source = page["source"]
      next if ignored.any? { |pattern| File.fnmatch?(pattern, source) }

      # Path relative to the docs root, without the file extension.
      relative_link = source.delete_prefix(absolute_root).sub(/\.[^\.]+$/, "")
      relative_link = File.join(config["url_prefix"], relative_link) if config["url_prefix"]

      link = page["link"] || File.join(config.fetch("hostname"), relative_link)
      id = page["id"] || relative_link.sub(/^\//, "").gsub(/[\/-]/, "-")
      # Per-page settings win over `defaults`; unset ones fall back to them.
      overrides = {groups: page["groups"], tags: page["tags"], weight: page["weight"]}.compact

      Document.new(id, source, link, **defaults.merge(overrides))
    end
  end

  private

  # Parsed config, memoized. An empty YAML file is treated as an empty config.
  def config = @config ||= (YAML.load_file(config_path) || {})

  # Normalizes page entries (plain string or hash with "source") into hashes
  # and expands glob patterns, producing one hash per matched file whose
  # "source" is the absolute file path.
  def unwrap_pages(items)
    items.flat_map do |item|
      attrs = item.is_a?(String) ? {"source" => item} : item
      pattern = File.expand_path(File.join(root_dir, attrs.fetch("source")))
      Dir.glob(pattern).map { |path| attrs.merge("source" => path) }
    end
  end
end
|
||
# Splits a chunk's HTML into smaller chunks: one root chunk with everything
# before the first h2 header, plus one chunk per h2 section.
class ChunkSplitter
  attr_reader :chunk

  # @param chunk [Hash] chunk payload with at least :chunk_html, :link and :tracking_id
  def initialize(chunk)
    @chunk = chunk
  end

  # @return [Array<Hash>] the root chunk followed by one chunk per h2 section.
  #   The h2 elements themselves are not copied into any chunk's HTML; their
  #   text becomes the section's metadata title.
  # @raise [KeyError] when the chunk has no :chunk_html (or :link / :tracking_id
  #   once an h2 is encountered)
  def chunks
    doc = Nokogiri::HTML(chunk.fetch(:chunk_html))

    # The root chunk gets a fixed weight of 1.5, replacing the incoming weight.
    # NOTE(review): the original comment said root chunks should "weigh less",
    # yet 1.5 is above Document's default weight of 1.0 — confirm the intended value.
    root_chunk = chunk_dup.merge!(
      weight: 1.5,
      # Pages without an h1 get a nil title instead of crashing.
      metadata: {title: doc.at_css("h1")&.inner_text}
    )

    doc.xpath("//body").children.each_with_object([root_chunk]) do |child, acc|
      if child.name == "h2"
        # Start a new chunk for this section.
        acc << section_chunk(child.inner_text)
      else
        acc.last[:chunk_html] << child.to_xhtml
      end
    end
  end

  private

  # Builds an empty chunk for the section titled `title`; link and tracking id
  # are suffixed with an anchor derived from the lowercased title.
  def section_chunk(title)
    anchor = title.downcase.gsub(/[^a-z0-9]/, "-")
    chunk_dup.merge!(
      link: "#{chunk.fetch(:link)}?id=#{anchor}",
      tracking_id: "#{chunk.fetch(:tracking_id)}##{anchor}",
      metadata: {title:}
    )
  end

  # Shallow copy of the source chunk with a fresh, mutable, empty :chunk_html.
  def chunk_dup = chunk.dup.tap { _1[:chunk_html] = +"" }
end
|
||
require "net/http" | ||
require "json" | ||
|
||
# Minimal HTTP client that pushes chunk groups and chunks to the Trieve API.
# When the DRY_RUN environment variable is set, requests are printed instead
# of being sent.
class TrieveClient
  BASE_URL = "https://api.trieve.ai/api"
  # Number of leading response-body characters quoted in error messages.
  ERROR_BODY_LIMIT = 100

  attr_reader :headers

  # @param api_key [String] sent as the "Authorization" header
  # @param dataset [String] sent as the "TR-Dataset" header
  def initialize(api_key, dataset)
    @headers = {
      "Authorization" => api_key,
      "TR-Dataset" => dataset
    }.freeze
  end

  # POSTs a chunk group to /chunk_group. The given hash is not modified.
  #
  # @param group [Hash] group payload
  # @param upsert [Boolean] value sent as `upsert_by_tracking_id`
  # @return [Object, nil] parsed JSON response, or nil in dry-run mode
  def push_group(group, upsert: true)
    perform_request("/chunk_group", group.merge(upsert_by_tracking_id: upsert).to_json)
  end

  # POSTs a chunk to /chunk. The given hash is not modified.
  #
  # @param chunk [Hash] chunk payload
  # @param upsert [Boolean] value sent as `upsert_by_tracking_id`
  # @return [Object, nil] parsed JSON response, or nil in dry-run mode
  def push_chunk(chunk, upsert: true)
    perform_request("/chunk", chunk.merge(upsert_by_tracking_id: upsert).to_json)
  end

  private

  # Sends `data` (a JSON string) as a POST to BASE_URL + path.
  #
  # @raise [RuntimeError] when the response status is not 200; the message
  #   quotes the first ERROR_BODY_LIMIT characters of the response body
  def perform_request(path, data)
    if ENV["DRY_RUN"]
      puts "[DRY RUN] Perform POST #{path}: #{data}"
      return
    end

    uri = URI.parse(BASE_URL + path)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    request = Net::HTTP::Post.new(
      uri.request_uri,
      headers.merge("Content-Type" => "application/json")
    )
    request.body = data

    response = http.request(request)

    unless response.code.to_i == 200
      # Quote the beginning of the body (it may be nil or very long).
      excerpt = response.body.to_s[0, ERROR_BODY_LIMIT]
      raise "Invalid response code: #{response.code} (#{excerpt})"
    end

    JSON.parse(response.body)
  end
end
|
||
# Entry point: reads the docs directory (first CLI argument, "./docs" by
# default), pushes all configured groups and then all chunks to Trieve.
# Requires the TRIEVE_KEY and TRIEVE_DATASET environment variables.

# File.expand_path resolves relative paths against the working directory and
# leaves absolute paths intact (joining with Dir.pwd first would prepend the
# working directory to an absolute argument).
docs_dir = File.expand_path(ARGV[0] || "./docs")
config_parser = ConfigParser.new(docs_dir)
client = TrieveClient.new(ENV.fetch("TRIEVE_KEY"), ENV.fetch("TRIEVE_DATASET"))

groups = config_parser.groups
if groups.any?
  progressbar = ProgressBar.create(title: "Groups", total: groups.size)
  groups.each do |group|
    client.push_group(group)
    progressbar.increment
  end
end

# Every document is rendered and split into per-section chunks before upload.
chunks = config_parser.documents.flat_map { |document| ChunkSplitter.new(document.to_chunk_json).chunks }
progressbar = ProgressBar.create(title: "Chunks", total: chunks.size)
chunks.each do |chunk|
  client.push_chunk(chunk)
  progressbar.increment
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
ignore: | ||
- "**/*/Readme.md" | ||
hostname: https://docs.anycable.io | ||
|
||
groups: | ||
- name: Deployment | ||
tracking_id: deployment | ||
|
||
pages: | ||
- "./getting_started.md" | ||
- source: "./pro.md" | ||
groups: ["pro"] | ||
- "./release_notes.md" | ||
- source: "./troubleshooting.md" | ||
weight: 3 | ||
- "./upgrade-notes/*.md" | ||
- "./benchmarks.md" | ||
- "./misc/*.md" | ||
- source: "./pro/*.md" | ||
groups: ["pro"] | ||
- "./guides/*.md" | ||
- source: "./deployment/*.md" | ||
groups: ["deployment"] |