Commit

+ uptriever + docs/.trieve.yml
palkan committed Jul 17, 2024
1 parent 9e37461 commit 9e9a7c5
Showing 2 changed files with 249 additions and 0 deletions.
226 changes: 226 additions & 0 deletions bin/uptriever
@@ -0,0 +1,226 @@
#!/usr/bin/env ruby

require "bundler/inline"

retried = false

begin
  gemfile(retried, quiet: true) do
    source "https://rubygems.org"

    gem "redcarpet", "~> 3.6"
    gem "nokogiri"
    gem "yaml"
    gem "ruby-progressbar"
    gem "uri"
    gem "debug"
  end
rescue StandardError
  raise if retried
  retried = true
  retry
end

class Document
  attr_reader :id, :path, :link, :tags, :groups, :weight

  def initialize(id, path, link, tags: nil, groups: nil, weight: 1.0)
    @id = id
    @path = path
    @link = link
    @tags = tags
    @groups = groups
    @weight = weight
  end

  def to_html
    case File.extname(path)
    when ".md"
      markdown = Redcarpet::Markdown.new(Redcarpet::Render::HTML, autolink: true, tables: true)
      markdown.render(File.read(path))
    when ".html"
      File.read(path)
    else
      raise ArgumentError, "Unsupported file type: #{path}"
    end
  end

  def to_chunk_json
    {
      chunk_html: to_html,
      link:,
      tracking_id: id,
      weight:
    }.tap do
      _1.merge!(tag_set: tags) if tags
      _1.merge!(group_tracking_ids: groups) if groups
    end
  end
end

class ConfigParser
  def self.parse(path) = new(path).documents

  attr_reader :config_path, :root_dir

  def initialize(root_dir)
    @root_dir = root_dir
    @config_path = File.join(root_dir, ".trieve.yml")
    raise ArgumentError, ".trieve.yml is missing in the #{root_dir}" unless File.file?(config_path)
  end

  def groups
    config["groups"] || []
  end

  def documents
    pages = unwrap_pages(config["pages"])

    defaults = (config["defaults"] || {}).transform_keys(&:to_sym)

    pages.filter_map do |page|
      next if config["ignore"] && config["ignore"].any? { File.fnmatch?(_1, page["source"]) }

      relative_link = page["source"].sub(root_dir, "").sub(/\.[^\.]+$/, "").then do
        next _1 unless config["url_prefix"]
        File.join(config["url_prefix"], _1)
      end

      link = page["link"] || File.join(config.fetch("hostname"), relative_link)
      id = page["id"] || relative_link.sub(/^\//, "").gsub(/[\/-]/, "-")

      Document.new(id, page["source"], link, **defaults.merge({groups: page["groups"], tags: page["tags"], weight: page["weight"]}.compact))
    end
  end

  private

  def config = @config ||= YAML.load_file(config_path)

  def unwrap_pages(items)
    items.flat_map do |item|
      if item.is_a?(String)
        Dir.glob(File.expand_path(File.join(root_dir, item))).map { {"source" => _1} }
      else
        Dir.glob(File.expand_path(File.join(root_dir, item.fetch("source")))).map do
          new_item = item.dup
          new_item["source"] = _1
          new_item
        end
      end
    end
  end
end

# Splits HTML into smaller chunks by h2 headers
class ChunkSplitter
  attr_reader :chunk

  def initialize(chunk)
    @chunk = chunk
  end

  def chunks
    doc = Nokogiri::HTML(chunk.fetch(:chunk_html))
    # Root chunks are usually less specific, so make them weigh less
    root_chunk = chunk_dup.tap {
      _1[:weight] = 1.5
      _1[:metadata] = {title: doc.at_css("h1").inner_text}
    }
    doc.xpath("//body").children.inject([root_chunk]) do |acc, child|
      # Start new chunk
      if child.name == "h2"
        anchor = child.inner_text.downcase.gsub(/[^a-z0-9]/, "-")
        acc << chunk_dup.tap {
          _1.merge!(
            link: "#{_1.fetch(:link)}?id=#{anchor}",
            tracking_id: "#{_1.fetch(:tracking_id)}##{anchor}",
            metadata: {title: child.inner_text}
          )
        }
        next acc
      end

      acc.last[:chunk_html] << child.to_xhtml
      acc
    end
  end

  private

  def chunk_dup = chunk.dup.tap { _1[:chunk_html] = +"" }
end

require "net/http"
require "json"

class TrieveClient
  BASE_URL = "https://api.trieve.ai/api"

  attr_reader :headers

  def initialize(api_key, dataset)
    @headers = {
      "Authorization" => api_key,
      "TR-Dataset" => dataset
    }.freeze
  end

  def push_group(group, upsert: true)
    group[:upsert_by_tracking_id] = upsert
    perform_request("/chunk_group", group.to_json)
  end

  def push_chunk(chunk, upsert: true)
    chunk[:upsert_by_tracking_id] = upsert
    perform_request("/chunk", chunk.to_json)
  end

  private

  def perform_request(path, data)
    uri = URI.parse(BASE_URL + path)

    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    request = Net::HTTP::Post.new(
      uri.request_uri,
      headers.merge("Content-Type" => "application/json")
    )
    request.body = data

    if ENV["DRY_RUN"]
      puts "[DRY RUN] Perform POST #{path}: #{data}"
      return
    end

    response = http.request(request)

    if response.code.to_i != 200
      raise "Invalid response code: #{response.code} (#{response.body[100...]})"
    end

    JSON.parse(response.body)
  end
end

docs_dir = File.expand_path(File.join(Dir.pwd, ARGV[0] || "./docs"))
config_parser = ConfigParser.new(docs_dir)
client = TrieveClient.new(ENV.fetch("TRIEVE_KEY"), ENV.fetch("TRIEVE_DATASET"))

groups = config_parser.groups
if groups.any?
  progressbar = ProgressBar.create(title: "Groups", total: groups.size)
  groups.each do
    client.push_group(_1)
    progressbar.increment
  end
end

chunks = config_parser.documents.flat_map { ChunkSplitter.new(_1.to_chunk_json).chunks }
progressbar = ProgressBar.create(title: "Chunks", total: chunks.size)
chunks.each do
  client.push_chunk(_1)
  progressbar.increment
end
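
The driver at the bottom of the script takes the docs directory from ARGV (defaulting to ./docs) and reads TRIEVE_KEY and TRIEVE_DATASET from the environment; setting DRY_RUN prints the requests instead of sending them. Below is a minimal invocation sketch from another Ruby process; the key and dataset values are placeholders, not part of this commit:

# Hypothetical wrapper: index ./docs in dry-run mode first.
env = {
  "TRIEVE_KEY" => "tr-xxxxxxxx",       # placeholder API key
  "TRIEVE_DATASET" => "my-dataset-id", # placeholder dataset id
  "DRY_RUN" => "1"                     # print requests instead of POSTing them
}
system(env, "ruby", "bin/uptriever", "docs") || abort("uptriever failed")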
23 changes: 23 additions & 0 deletions docs/.trieve.yml
@@ -0,0 +1,23 @@
ignore:
- "**/*/Readme.md"
hostname: https://docs.anycable.io

groups:
- name: Deployment
  tracking_id: deployment

pages:
- "./getting_started.md"
- source: "./pro.md"
  groups: ["pro"]
- "./release_notes.md"
- source: "./troubleshooting.md"
  weight: 3
- "./upgrade-notes/*.md"
- "./benchmarks.md"
- "./misc/*.md"
- source: "./pro/*.md"
  groups: ["pro"]
- "./guides/*.md"
- source: "./deployment/*.md"
  groups: ["deployment"]
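
For illustration, the pages entry for ./pro.md above resolves (per ConfigParser in bin/uptriever, and assuming no url_prefix is set) to roughly the following document; the absolute path and exact values are inferred, not part of the commit:

# Sketch of the document derived from the ./pro.md entry
Document.new(
  "pro",                           # id: extension-less relative link with the leading "/" stripped
  "/path/to/docs/pro.md",          # path: glob-expanded source file (placeholder path)
  "https://docs.anycable.io/pro",  # link: hostname joined with the relative link
  groups: ["pro"]                  # passed through from the page entry
)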
