From 1bb7517ee3c78ce7c298f962ca64cd7e65d76a4a Mon Sep 17 00:00:00 2001 From: Joseph Lewis III Date: Thu, 10 Oct 2019 08:15:52 -0700 Subject: [PATCH] first pass at fixing #4 --- CHANGELOG.md | 6 +++++- docs/index.asciidoc | 13 +++++++++++++ lib/logstash/inputs/cloud_storage/client.rb | 15 +++++++++++++-- lib/logstash/inputs/google_cloud_storage.rb | 3 ++- logstash-input-google_cloud_storage.gemspec | 2 +- spec/inputs/cloud_storage/client_spec.rb | 2 +- spec/inputs/google_cloud_storage_spec.rb | 3 ++- 7 files changed, 37 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5afdd9..4d45771 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.12.0 + +- Added support for `file_prefix` option for server-side filtering + ## 0.11.0 - Change gzip file detection to use mime type instead of extension @@ -10,7 +14,7 @@ ## 0.9.0 - Initial release -- File inclusion/exclusion by +- File inclusion/exclusion by - regex - processed database - metadata key diff --git a/docs/index.asciidoc b/docs/index.asciidoc index d3fade7..444469a 100644 --- a/docs/index.asciidoc +++ b/docs/index.asciidoc @@ -160,6 +160,7 @@ This plugin supports the following configuration options plus the <> |<>|No | <> |<>|No | <> |<>|No +| <> |<>|No | <> |<>|No | <> |<>|No | <> |<>|No @@ -206,6 +207,18 @@ The number of seconds between looking for new files in your bucket. A regex pattern to filter files. Only files with names matching this will be considered. All files match by default. +[id="plugins-{type}s-{plugin}-file_prefix"] +===== `file_prefix` + +added[0.12.0] + + * Value type is <> + * Default is: `` + +A prefix filter applied server-side. Only files starting with this prefix will +be fetched from Cloud Storage. This can be useful if all the files you want to +process are in a particular folder and want to reduce network traffic. + [id="plugins-{type}s-{plugin}-file_exclude"] ===== `file_exclude` diff --git a/lib/logstash/inputs/cloud_storage/client.rb b/lib/logstash/inputs/cloud_storage/client.rb index 38ceb5f..1c1978d 100644 --- a/lib/logstash/inputs/cloud_storage/client.rb +++ b/lib/logstash/inputs/cloud_storage/client.rb @@ -10,16 +10,27 @@ module Inputs module CloudStorage # Client provides all the required transport and authentication setup for the plugin. class Client - def initialize(bucket, json_key_path, logger) + def initialize(bucket, json_key_path, logger, blob_prefix='') @logger = logger @bucket = bucket + @blob_prefix = blob_prefix # create client @storage = initialize_storage json_key_path end + java_import 'com.google.cloud.storage.Storage' def list_blobs - @storage.list(@bucket).iterateAll().each do |blobname| + # NOTE: there is the option to filter which fields are returned by + # the call. If we find the bandwidth overhead is too much it would be + # possible (but tedious) to filter the returned fields to just those + # that this plugin uses. + filter = [] + if @blob_prefix != '' + filter = [Storage::BlobListOption.prefix(@blob_prefix)] + end + + @storage.list(@bucket, filter.to_java).iterateAll().each do |blobname| yield LogStash::Inputs::CloudStorage::BlobAdapter.new(blobname) end rescue Java::ComGoogleCloudStorage::StorageException => e diff --git a/lib/logstash/inputs/google_cloud_storage.rb b/lib/logstash/inputs/google_cloud_storage.rb index b478ce7..5053e7e 100644 --- a/lib/logstash/inputs/google_cloud_storage.rb +++ b/lib/logstash/inputs/google_cloud_storage.rb @@ -21,6 +21,7 @@ class LogStash::Inputs::GoogleCloudStorage < LogStash::Inputs::Base config :interval, :validate => :number, :default => 60 # Inclusion/Exclusion Criteria + config :file_prefix, :validate => :string, :default => '' config :file_matches, :validate => :string, :default => '.*\\.log(\\.gz)?' config :file_exclude, :validate => :string, :default => '^$' config :metadata_key, :validate => :string, :default => 'x-goog-meta-ls-gcs-input' @@ -39,7 +40,7 @@ class LogStash::Inputs::GoogleCloudStorage < LogStash::Inputs::Base def register FileUtils.mkdir_p(@temp_directory) unless Dir.exist?(@temp_directory) - @client = LogStash::Inputs::CloudStorage::Client.new(@bucket_id, @json_key_file, @logger) + @client = LogStash::Inputs::CloudStorage::Client.new(@bucket_id, @json_key_file, @logger, @file_prefix) if @processed_db_path.nil? ls_data = LogStash::SETTINGS.get_value('path.data') diff --git a/logstash-input-google_cloud_storage.gemspec b/logstash-input-google_cloud_storage.gemspec index c6442f3..510b703 100644 --- a/logstash-input-google_cloud_storage.gemspec +++ b/logstash-input-google_cloud_storage.gemspec @@ -1,6 +1,6 @@ Gem::Specification.new do |s| s.name = 'logstash-input-google_cloud_storage' - s.version = '0.11.0' + s.version = '0.12.0' s.licenses = ['Apache-2.0'] s.summary = 'Plugin to import log data from Google Cloud Storage (GCS).' s.description = 'This gem is a Logstash plugin required to be installed on top of the '\ diff --git a/spec/inputs/cloud_storage/client_spec.rb b/spec/inputs/cloud_storage/client_spec.rb index 2d39041..022b111 100644 --- a/spec/inputs/cloud_storage/client_spec.rb +++ b/spec/inputs/cloud_storage/client_spec.rb @@ -13,7 +13,7 @@ it 'does not throw an error when initializing' do key_file = ::File.join('spec', 'fixtures', 'credentials.json') - LogStash::Inputs::CloudStorage::Client.new('my-bucket', key_file, logger) + LogStash::Inputs::CloudStorage::Client.new('my-bucket', key_file, logger, 'prefix') end end end diff --git a/spec/inputs/google_cloud_storage_spec.rb b/spec/inputs/google_cloud_storage_spec.rb index ec99daa..7bc18a4 100644 --- a/spec/inputs/google_cloud_storage_spec.rb +++ b/spec/inputs/google_cloud_storage_spec.rb @@ -22,7 +22,8 @@ 'processed_db_path' => processed_db_dir, 'temp_directory' => download_dir, 'delete' => true, - 'unpack_gzip' => false + 'unpack_gzip' => false, + 'file_prefix' => '/some/prefix/here' } }