From 9823f4a1a1a25f44adf70d68c09aee43266fbbbc Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Tue, 15 Oct 2024 16:17:49 +0100 Subject: [PATCH 1/8] Introduce new specialist finder index event processor and classes Make a copy of the existing govuk index classes that we would need to make changes to --- lib/specialist_finder_index/client.rb | 11 + .../document_type_mapper.rb | 29 ++ .../presenters/elasticsearch_presenter.rb | 297 ++++++++++++++++++ .../presenters/specialist_presenter.rb | 167 ++++++++++ .../publishing_event_job.rb | 137 ++++++++ .../publishing_event_processor.rb | 11 + 6 files changed, 652 insertions(+) create mode 100644 lib/specialist_finder_index/client.rb create mode 100644 lib/specialist_finder_index/document_type_mapper.rb create mode 100644 lib/specialist_finder_index/presenters/elasticsearch_presenter.rb create mode 100644 lib/specialist_finder_index/presenters/specialist_presenter.rb create mode 100644 lib/specialist_finder_index/publishing_event_job.rb create mode 100644 lib/specialist_finder_index/publishing_event_processor.rb diff --git a/lib/specialist_finder_index/client.rb b/lib/specialist_finder_index/client.rb new file mode 100644 index 000000000..c77965409 --- /dev/null +++ b/lib/specialist_finder_index/client.rb @@ -0,0 +1,11 @@ +module SpecialistFinderIndex + class Client < Index::Client + private + + def index_name + # rubocop:disable Naming/MemoizedInstanceVariableName + @_index ||= SearchConfig.govuk_index_name + # rubocop:enable Naming/MemoizedInstanceVariableName + end + end +end diff --git a/lib/specialist_finder_index/document_type_mapper.rb b/lib/specialist_finder_index/document_type_mapper.rb new file mode 100644 index 000000000..a4568edd3 --- /dev/null +++ b/lib/specialist_finder_index/document_type_mapper.rb @@ -0,0 +1,29 @@ +module SpecialistFinderIndex + class DocumentTypeMapper + UNPUBLISHING_TYPES = %w[gone redirect substitute vanish].freeze + + def initialize(payload) + @payload = payload + end + + def type + 
elasticsearch_document_type + end + + def unpublishing_type? + UNPUBLISHING_TYPES.include?(payload["document_type"]) + end + + private + + attr_reader :payload + + def mapped_document_types + @mapped_document_types ||= YAML.load_file(File.join(__dir__, "../../config/govuk_index/mapped_document_types.yaml")) + end + + def elasticsearch_document_type + @elasticsearch_document_type ||= mapped_document_types[payload["document_type"]] + end + end +end diff --git a/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb b/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb new file mode 100644 index 000000000..a15468372 --- /dev/null +++ b/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb @@ -0,0 +1,297 @@ +module SpecialistFinderIndex + class ElasticsearchPresenter + include GovukIndex::ElasticsearchIdentity + + def initialize(payload:, type_mapper:) + @payload = payload + @inferred_type = type_mapper + end + + def type + @type ||= @inferred_type.type + end + + def document + { + ai_assurance_technique: specialist.ai_assurance_technique, + aircraft_category: specialist.aircraft_category, + aircraft_type: specialist.aircraft_type, + alert_type: specialist.alert_type, + algorithmic_transparency_record_atrs_version: specialist.algorithmic_transparency_record_atrs_version, + algorithmic_transparency_record_capability: specialist.algorithmic_transparency_record_capability, + algorithmic_transparency_record_date_published: specialist.algorithmic_transparency_record_date_published, + algorithmic_transparency_record_function: specialist.algorithmic_transparency_record_function, + algorithmic_transparency_record_organisation: specialist.algorithmic_transparency_record_organisation, + algorithmic_transparency_record_organisation_type: specialist.algorithmic_transparency_record_organisation_type, + algorithmic_transparency_record_other_tags: specialist.algorithmic_transparency_record_other_tags, + algorithmic_transparency_record_phase: 
specialist.algorithmic_transparency_record_phase, + algorithmic_transparency_record_region: specialist.algorithmic_transparency_record_region, + algorithmic_transparency_record_task: specialist.algorithmic_transparency_record_task, + areas_of_interest: specialist.areas_of_interest, + assessment_date: specialist.assessment_date, + assurance_technique_approach: specialist.assurance_technique_approach, + attachments: common_fields.attachments, + authors: specialist.authors, + business_sizes: specialist.business_sizes, + business_stages: specialist.business_stages, + case_state: specialist.case_state, + case_type: specialist.case_type, + category: specialist.category, + certificate_status: specialist.certificate_status, + class_category: specialist.class_category, + closed_date: specialist.closed_date, + closing_date: specialist.closing_date, + commodity_type: specialist.commodity_type, + contact_groups: details.contact_groups, + content_id: common_fields.content_id, + content_purpose_subgroup: common_fields.content_purpose_subgroup, + content_purpose_supergroup: common_fields.content_purpose_supergroup, + content_store_document_type: common_fields.content_store_document_type, + continuation_link: specialist.continuation_link, + country: specialist.country, + country_of_origin: specialist.country_of_origin, + date_application: specialist.date_application, + date_of_completion: specialist.date_of_completion, + date_of_occurrence: specialist.date_of_occurrence, + date_of_start: specialist.date_of_start, + date_registration: specialist.date_registration, + date_registration_eu: specialist.date_registration_eu, + decision_subject: specialist.decision_subject, + description: common_fields.description, + destination_country: specialist.destination_country, + development_sector: specialist.development_sector, + digital_market_research_area: specialist.digital_market_research_area, + digital_market_research_category: specialist.digital_market_research_category, + 
digital_market_research_publish_date: specialist.digital_market_research_publish_date, + digital_market_research_publisher: specialist.digital_market_research_publisher, + digital_market_research_topic: specialist.digital_market_research_topic, + disease_case_closed_date: specialist.disease_case_closed_date, + disease_case_opened_date: specialist.disease_case_opened_date, + disease_type: specialist.disease_type, + document_type: type, + eligible_entities: specialist.eligible_entities, + email_document_supertype: common_fields.email_document_supertype, + first_published_at: specialist.first_published_at, + flood_and_coastal_erosion_category: specialist.flood_and_coastal_erosion_category, + format: common_fields.format, + fund_state: specialist.fund_state, + fund_type: specialist.fund_type, + funding_amount: specialist.funding_amount, + funding_source: specialist.funding_source, + government_document_supertype: common_fields.government_document_supertype, + government_name: common_fields.government_name, + grant_type: specialist.grant_type, + hidden_indexable_content: specialist.hidden_indexable_content, + hmrc_manual_section_id: common_fields.section_id, + image_url:, + indexable_content: indexable.indexable_content, + industries: specialist.industries, + internal_notes: specialist.internal_notes, + is_historic: common_fields.historic?, + is_political: common_fields.political?, + is_withdrawn: common_fields.withdrawn?, + issued_date: specialist.issued_date, + keyword: specialist.keyword, + key_function: specialist.key_function, + laid_date: specialist.laid_date, + land_use: specialist.land_use, + land_types: specialist.land_types, + latest_change_note: details.latest_change_note, + licence_identifier: details.licence_identifier, + licence_transaction_continuation_link: specialist.licence_transaction_continuation_link, + licence_transaction_industry: specialist.licence_transaction_industry, + licence_transaction_licence_identifier: 
specialist.licence_transaction_licence_identifier, + licence_transaction_location: specialist.licence_transaction_location, + licence_transaction_will_continue_on: specialist.licence_transaction_will_continue_on, + licence_short_description: details.licence_short_description, + life_saving_maritime_appliance_service_station_regions: specialist.life_saving_maritime_appliance_service_station_regions, + life_saving_maritime_appliance_type: specialist.life_saving_maritime_appliance_type, + life_saving_maritime_appliance_manufacturer: specialist.life_saving_maritime_appliance_manufacturer, + link: common_fields.link, + location: specialist.location, + mainstream_browse_page_content_ids: expanded_links.mainstream_browse_page_content_ids, + mainstream_browse_pages: expanded_links.mainstream_browse_pages, + manual: details.parent_manual, + marine_notice_topic: specialist.marine_notice_topic, + marine_notice_type: specialist.marine_notice_type, + marine_notice_vessel_type: specialist.marine_notice_vessel_type, + market_sector: specialist.market_sector, + medical_specialism: specialist.medical_specialism, + opened_date: specialist.opened_date, + organisation_content_ids: expanded_links.organisation_content_ids, + organisations: expanded_links.organisations, + outcome_type: specialist.outcome_type, + part_of_taxonomy_tree: expanded_links.part_of_taxonomy_tree, + parts: parts.presented_parts, + payment_types: specialist.payment_types, + people: expanded_links.people, + policy_groups: expanded_links.policy_groups, + popularity: common_fields.popularity, + popularity_b: common_fields.popularity_b, + primary_publishing_organisation: expanded_links.primary_publishing_organisation, + principle: specialist.principle, + product_alert_type: specialist.product_alert_type, + product_category: specialist.product_category, + product_measure_type: specialist.product_measure_type, + product_recall_alert_date: specialist.product_recall_alert_date, + product_risk_level: 
specialist.product_risk_level, + project_code: specialist.project_code, + project_status: specialist.project_status, + protection_type: specialist.protection_type, + public_timestamp: common_fields.public_timestamp, + publishing_app: common_fields.publishing_app, + railway_type: specialist.railway_type, + reason_for_protection: specialist.reason_for_protection, + reference_number: specialist.reference_number, + regions: specialist.regions, + register: specialist.register, + registered_name: specialist.registered_name, + registration: specialist.registration, + rendering_app: common_fields.rendering_app, + report_type: specialist.report_type, + research_document_type: specialist.research_document_type, + result: specialist.result, + review_status: specialist.review_status, + role_appointments: expanded_links.role_appointments, + roles: expanded_links.roles, + sector: specialist.sector, + service_provider: specialist.service_provider, + sift_end_date: specialist.sift_end_date, + sifting_status: specialist.sifting_status, + slug:, + stage: specialist.stage, + status: specialist.status, + subject: specialist.subject, + taxons: expanded_links.taxons, + theme: specialist.theme, + therapeutic_area: specialist.therapeutic_area, + tiers_or_standalone_items: specialist.tiers_or_standalone_items, + time_registration: specialist.time_registration, + title: common_fields.title, + topical_events: expanded_links.topical_events, + topics: specialist.topics, + traditional_term_grapevine_product_category: specialist.traditional_term_grapevine_product_category, + traditional_term_language: specialist.traditional_term_language, + traditional_term_type: specialist.traditional_term_type, + tribunal_decision_categories: specialist.tribunal_decision_categories, + tribunal_decision_category: specialist.tribunal_decision_category, + tribunal_decision_country: specialist.tribunal_decision_country, + tribunal_decision_decision_date: specialist.tribunal_decision_decision_date, + 
tribunal_decision_judges: specialist.tribunal_decision_judges, + tribunal_decision_landmark: specialist.tribunal_decision_landmark, + tribunal_decision_reference_number: specialist.tribunal_decision_reference_number, + tribunal_decision_sub_categories: specialist.tribunal_decision_sub_categories, + tribunal_decision_sub_category: specialist.tribunal_decision_sub_category, + types_of_support: specialist.types_of_support, + updated_at: common_fields.updated_at, + use_case: specialist.use_case, + user_journey_document_supertype: common_fields.user_journey_document_supertype, + value_of_funding: specialist.value_of_funding, + vessel_type: specialist.vessel_type, + veterans_support_organisation_health_and_social_care: specialist.veterans_support_organisation_health_and_social_care, + veterans_support_organisation_finance: specialist.veterans_support_organisation_finance, + veterans_support_organisation_legal_and_justice: specialist.veterans_support_organisation_legal_and_justice, + veterans_support_organisation_employment_education_and_training: specialist.veterans_support_organisation_employment_education_and_training, + veterans_support_organisation_housing: specialist.veterans_support_organisation_housing, + veterans_support_organisation_families_and_children: specialist.veterans_support_organisation_families_and_children, + veterans_support_organisation_community_and_social: specialist.veterans_support_organisation_community_and_social, + veterans_support_organisation_region_england: specialist.veterans_support_organisation_region_england, + veterans_support_organisation_region_northern_ireland: specialist.veterans_support_organisation_region_northern_ireland, + veterans_support_organisation_region_scotland: specialist.veterans_support_organisation_region_scotland, + veterans_support_organisation_region_wales: specialist.veterans_support_organisation_region_wales, + view_count: common_fields.view_count, + virus_strain: specialist.virus_strain, + will_continue_on: 
specialist.will_continue_on, + withdrawn_date: specialist.withdrawn_date, + world_locations: expanded_links.world_locations, + year_adopted: specialist.year_adopted, + zone_restriction: specialist.zone_restriction, + zone_type: specialist.zone_type, + }.reject { |_, v| v.nil? } + end + + def updated_at + common_fields.updated_at + end + + def format + common_fields.format + end + + def base_path + common_fields.base_path + end + + def link + common_fields.link + end + + def publishing_app + common_fields.publishing_app + end + + def valid! + if format == "recommended-link" + details.url || raise(MissingExternalUrl, "url missing from details section") + else + base_path || raise(NotIdentifiable, "base_path missing from payload") + end + end + + def image_url + details.image_url || (expanded_links.default_news_image if newslike?) + end + + private + + attr_reader :payload + + def indexable + GovukIndex::IndexableContentPresenter.new( + format: common_fields.format, + details: payload["details"], + sanitiser: GovukIndex::IndexableContentSanitiser.new, + ) + end + + def slug + case format + when "mainstream_browse_page" + base_path.gsub(%r{^/browse/}, "") + when "policy" + base_path.gsub(%r{^/government/policies/}, "") + when "person" + base_path.gsub(%r{^/government/people/}, "") + when "ministerial_role" + base_path.gsub(%r{^/government/ministers/}, "") + end + end + + def common_fields + @common_fields ||= GovukIndex::CommonFieldsPresenter.new(payload) + end + + def details + @details ||= GovukIndex::DetailsPresenter.new(details: payload["details"], format: common_fields.format) + end + + def parts + @parts ||= GovukIndex::PartsPresenter.new(parts: payload["details"].fetch("parts", [])) + end + + def expanded_links + @expanded_links ||= GovukIndex::ExpandedLinksPresenter.new(payload["expanded_links"]) + end + + def specialist + @specialist ||= SpecialistPresenter.new(payload) + end + + def newslike? 
+ return false if common_fields.content_store_document_type == "fatality_notice" + + common_fields.content_purpose_subgroup == "news" || + common_fields.content_purpose_subgroup == "speeches_and_statements" + end + end +end diff --git a/lib/specialist_finder_index/presenters/specialist_presenter.rb b/lib/specialist_finder_index/presenters/specialist_presenter.rb new file mode 100644 index 000000000..b5d6b0ed9 --- /dev/null +++ b/lib/specialist_finder_index/presenters/specialist_presenter.rb @@ -0,0 +1,167 @@ +module SpecialistFinderIndex + class SpecialistPresenter + extend GovukIndex::MethodBuilder + + set_payload_method :metadata + + delegate_to_payload :ai_assurance_technique, convert_to_array: true + delegate_to_payload :aircraft_category + delegate_to_payload :aircraft_type + delegate_to_payload :alert_type, convert_to_array: true + delegate_to_payload :algorithmic_transparency_record_atrs_version + delegate_to_payload :algorithmic_transparency_record_capability, convert_to_array: true + delegate_to_payload :algorithmic_transparency_record_date_published + delegate_to_payload :algorithmic_transparency_record_function, convert_to_array: true + delegate_to_payload :algorithmic_transparency_record_organisation + delegate_to_payload :algorithmic_transparency_record_organisation_type, convert_to_array: true + delegate_to_payload :algorithmic_transparency_record_other_tags + delegate_to_payload :algorithmic_transparency_record_phase + delegate_to_payload :algorithmic_transparency_record_region, convert_to_array: true + delegate_to_payload :algorithmic_transparency_record_task + delegate_to_payload :areas_of_interest + delegate_to_payload :assessment_date + delegate_to_payload :assurance_technique_approach, convert_to_array: true + delegate_to_payload :authors + delegate_to_payload :business_sizes + delegate_to_payload :business_stages + delegate_to_payload :category, convert_to_array: true + delegate_to_payload :case_state, convert_to_array: true + 
delegate_to_payload :case_type, convert_to_array: true + delegate_to_payload :certificate_status + delegate_to_payload :class_category + delegate_to_payload :closed_date + delegate_to_payload :closing_date + delegate_to_payload :commodity_type + delegate_to_payload :continuation_link + delegate_to_payload :country + delegate_to_payload :country_of_origin + delegate_to_payload :date_application + delegate_to_payload :date_of_completion + delegate_to_payload :date_of_occurrence + delegate_to_payload :date_of_start + delegate_to_payload :date_registration + delegate_to_payload :date_registration_eu + delegate_to_payload :decision_subject + delegate_to_payload :destination_country, convert_to_array: true + delegate_to_payload :development_sector + delegate_to_payload :digital_market_research_area, convert_to_array: true + delegate_to_payload :digital_market_research_category + delegate_to_payload :digital_market_research_publish_date + delegate_to_payload :digital_market_research_publisher, convert_to_array: true + delegate_to_payload :digital_market_research_topic, convert_to_array: true + delegate_to_payload :disease_case_closed_date + delegate_to_payload :disease_case_opened_date + delegate_to_payload :disease_type, convert_to_array: true + delegate_to_payload :eligible_entities + delegate_to_payload :flood_and_coastal_erosion_category + delegate_to_payload :fund_state, convert_to_array: true + delegate_to_payload :fund_type + delegate_to_payload :funding_amount + delegate_to_payload :funding_source + delegate_to_payload :grant_type, convert_to_array: true + delegate_to_payload :hidden_indexable_content + delegate_to_payload :industries + delegate_to_payload :internal_notes + delegate_to_payload :issued_date + delegate_to_payload :key_function, convert_to_array: true + delegate_to_payload :keyword + delegate_to_payload :laid_date + delegate_to_payload :land_types + delegate_to_payload :land_use + delegate_to_payload :licence_transaction_continuation_link + 
delegate_to_payload :licence_transaction_industry, convert_to_array: true + delegate_to_payload :licence_transaction_licence_identifier + delegate_to_payload :licence_transaction_location, convert_to_array: true + delegate_to_payload :licence_transaction_will_continue_on + delegate_to_payload :life_saving_maritime_appliance_service_station_regions, convert_to_array: true + delegate_to_payload :life_saving_maritime_appliance_type, convert_to_array: true + delegate_to_payload :life_saving_maritime_appliance_manufacturer, convert_to_array: true + delegate_to_payload :location, convert_to_array: true + delegate_to_payload :marine_notice_topic + delegate_to_payload :marine_notice_type + delegate_to_payload :marine_notice_vessel_type + delegate_to_payload :market_sector + delegate_to_payload :medical_specialism + delegate_to_payload :opened_date + delegate_to_payload :outcome_type + delegate_to_payload :payment_types + delegate_to_payload :principle, convert_to_array: true + delegate_to_payload :product_alert_type + delegate_to_payload :product_category + delegate_to_payload :product_measure_type + delegate_to_payload :product_recall_alert_date + delegate_to_payload :product_risk_level + delegate_to_payload :project_code + delegate_to_payload :project_status + delegate_to_payload :protection_type + delegate_to_payload :railway_type + delegate_to_payload :reason_for_protection + delegate_to_payload :reference_number + delegate_to_payload :regions + delegate_to_payload :register + delegate_to_payload :registered_name + delegate_to_payload :registration + delegate_to_payload :report_type, convert_to_array: true + delegate_to_payload :research_document_type + delegate_to_payload :result + delegate_to_payload :review_status + delegate_to_payload :sector, convert_to_array: true + delegate_to_payload :service_provider + delegate_to_payload :sift_end_date + delegate_to_payload :sifting_status + delegate_to_payload :stage + delegate_to_payload :status + delegate_to_payload 
:subject + delegate_to_payload :theme + delegate_to_payload :therapeutic_area + delegate_to_payload :tiers_or_standalone_items + delegate_to_payload :time_registration + delegate_to_payload :topics + delegate_to_payload :traditional_term_grapevine_product_category + delegate_to_payload :traditional_term_language + delegate_to_payload :traditional_term_type + delegate_to_payload :tribunal_decision_categories + delegate_to_payload :tribunal_decision_category + delegate_to_payload :tribunal_decision_country + delegate_to_payload :tribunal_decision_decision_date + delegate_to_payload :tribunal_decision_judges + delegate_to_payload :tribunal_decision_landmark + delegate_to_payload :tribunal_decision_reference_number + delegate_to_payload :tribunal_decision_sub_categories + delegate_to_payload :tribunal_decision_sub_category + delegate_to_payload :types_of_support + delegate_to_payload :use_case, convert_to_array: true + delegate_to_payload :value_of_funding + delegate_to_payload :vessel_type + delegate_to_payload :veterans_support_organisation_health_and_social_care + delegate_to_payload :veterans_support_organisation_finance + delegate_to_payload :veterans_support_organisation_legal_and_justice + delegate_to_payload :veterans_support_organisation_employment_education_and_training + delegate_to_payload :veterans_support_organisation_housing + delegate_to_payload :veterans_support_organisation_families_and_children + delegate_to_payload :veterans_support_organisation_community_and_social + delegate_to_payload :veterans_support_organisation_region_england + delegate_to_payload :veterans_support_organisation_region_northern_ireland + delegate_to_payload :veterans_support_organisation_region_scotland + delegate_to_payload :veterans_support_organisation_region_wales + delegate_to_payload :virus_strain + delegate_to_payload :will_continue_on + delegate_to_payload :withdrawn_date + delegate_to_payload :year_adopted + delegate_to_payload :zone_restriction + delegate_to_payload 
:zone_type, convert_to_array: true + + def initialize(payload) + @payload = payload + @metadata = @payload.dig("details", "metadata") || {} + end + + def first_published_at + metadata["first_published_at"] || @payload["first_published_at"] + end + + private + + attr_reader :metadata + end +end diff --git a/lib/specialist_finder_index/publishing_event_job.rb b/lib/specialist_finder_index/publishing_event_job.rb new file mode 100644 index 000000000..ff74f90b4 --- /dev/null +++ b/lib/specialist_finder_index/publishing_event_job.rb @@ -0,0 +1,137 @@ +module SpecialistFinderIndex + class ElasticsearchRetryError < StandardError; end + + class ElasticsearchInvalidResponseItemCount < StandardError; end + + class MissingTextHtmlContentType < StandardError; end + + class MultipleMessagesInElasticsearchResponse < StandardError; end + + class NotFoundError < StandardError; end + + class UnknownDocumentTypeError < StandardError; end + + class NotIdentifiable < StandardError; end + + class MissingExternalUrl < StandardError; end + + DOCUMENT_TYPES_WITHOUT_BASE_PATH = + %w[ + contact + role_appointment + world_location + + # + role + document + types + ambassador_role + board_member_role + chief_professional_officer_role + chief_scientific_officer_role + chief_scientific_advisor_role + deputy_head_of_mission_role + governor_role + high_commissioner_role + military_role + ministerial_role + special_representative_role + traffic_commissioner_role + worldwide_office_staff_role + ].freeze + + class PublishingEventJob < BaseJob + notify_of_failures + + def perform(messages) + processor = Index::ElasticsearchProcessor.govuk + + messages.each do |routing_key, payload| + process_action(processor, routing_key, payload) + end + + responses = processor.commit + + (responses || []).each do |response| + process_response(response, messages) + end + # Rescuing exception to guarantee we capture all Sidekiq retries + rescue Exception # rubocop:disable Lint/RescueException + 
Services.statsd_client.increment("govuk_index.sidekiq-retry") + raise + end + + private + + def process_action(processor, routing_key, payload) + logger.debug("Processing #{routing_key}: #{payload}") + Services.statsd_client.increment("govuk_index.sidekiq-consumed") + + type_mapper = DocumentTypeMapper.new(payload) + + presenter = if type_mapper.unpublishing_type? + GovukIndex::ElasticsearchDeletePresenter.new(payload:) + else + ElasticsearchPresenter.new( + payload: GovukIndex::PayloadPreparer.new(payload).prepare, + type_mapper:, + ) + end + + presenter.valid! + + identifier = "#{presenter.link} #{presenter.type || "'unmapped type'"}" + + if type_mapper.unpublishing_type? + logger.info("#{routing_key} -> DELETE #{identifier}") + processor.delete(presenter) + elsif payload.fetch("locale", "en") != "en" || GovukIndex::MigratedFormats.non_indexable?(presenter.format, presenter.base_path, presenter.publishing_app) + logger.info("#{routing_key} -> BLOCKLISTED #{identifier}") + elsif GovukIndex::MigratedFormats.indexable?(presenter.format, presenter.base_path, presenter.publishing_app) + logger.info("#{routing_key} -> INDEX #{identifier}") + processor.save(presenter) + else + logger.info("#{routing_key} -> UNKNOWN #{identifier}") + end + + # Rescuing as we don't want to retry this class of error + rescue NotIdentifiable => e + return if DOCUMENT_TYPES_WITHOUT_BASE_PATH.include?(payload["document_type"]) + + GovukError.notify(e, extra: { message_body: payload }) + # Unpublishing messages for something that does not exist may have been + # processed out of order so we don't want to notify errbit but just allow + # the process to continue + rescue NotFoundError + logger.info("#{payload['base_path']} could not be found.") + Services.statsd_client.increment("govuk_index.not-found-error") + rescue UnknownDocumentTypeError + logger.info("#{payload['document_type']} document type is not known.") + Services.statsd_client.increment("govuk_index.unknown-document-type") + end + + 
def process_response(response, messages) + messages_with_error = [] + if response["items"].count > 1 + Services.statsd_client.increment("govuk_index.elasticsearch.multiple_responses") + end + + if response["items"].count != messages.count + raise ElasticsearchInvalidResponseItemCount, "received #{response['items'].count} expected #{messages.count}" + end + + response["items"].zip(messages).each do |response_for_message, message| + messages_with_error << message unless Index::ResponseValidator.new(namespace: "govuk_index").valid?(response_for_message) + end + + if messages_with_error.any? + # raise an error so that all messages are retried. + # NOTE: versioned ES actions can be performed multiple with a consistent result. + raise ElasticsearchRetryError.new( + reason: "Elasticsearch failures", + messages: "#{messages_with_error.count} of #{messages.count} failed - see ElasticsearchError's for details", + ) + end + end + end +end diff --git a/lib/specialist_finder_index/publishing_event_processor.rb b/lib/specialist_finder_index/publishing_event_processor.rb new file mode 100644 index 000000000..73f3fb385 --- /dev/null +++ b/lib/specialist_finder_index/publishing_event_processor.rb @@ -0,0 +1,11 @@ +module SpecialistFinderIndex + class PublishingEventProcessor + def process(messages) + messages = Array(messages) # treat a single message as an array with one value + + Services.statsd_client.increment("govuk_index.rabbit-mq-consumed") + PublishingEventJob.perform_async(messages.map { |msg| [msg.delivery_info[:routing_key], msg.payload] }) + messages.each(&:ack) + end + end +end From baa562f504e02498dfd9fd9cc0acc5420cdd9680 Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Tue, 15 Oct 2024 16:35:27 +0100 Subject: [PATCH 2/8] Amend Specialist Finder index files to customise for specialist document cases - add new index schema - No longer need to map document types from publishing API - directly use publishing API document type, as most specialist document types are a 
direct 1:1 mapping - No need to check for migrated data, as all specialist documents are new and do not need to be migrated from old indices - Remove any fields that are specific to other document types, such as news or ministerial roles --- config/schema/indexes/specialist-finder.json | 48 +++++++ elasticsearch.yml | 2 + lib/index/elasticsearch_processor.rb | 4 + lib/search_config.rb | 1 + lib/specialist_finder_index/client.rb | 2 +- .../document_type_mapper.rb | 16 +-- .../presenters/elasticsearch_presenter.rb | 124 ++++++++---------- .../publishing_event_job.rb | 55 +++----- .../publishing_event_processor.rb | 2 +- 9 files changed, 126 insertions(+), 128 deletions(-) create mode 100644 config/schema/indexes/specialist-finder.json diff --git a/config/schema/indexes/specialist-finder.json b/config/schema/indexes/specialist-finder.json new file mode 100644 index 000000000..471c6c73f --- /dev/null +++ b/config/schema/indexes/specialist-finder.json @@ -0,0 +1,48 @@ +{ + "elasticsearch_types": [ + "aaib_report", + "ai_assurance_portfolio_technique", + "algorithmic_transparency_record", + "animal_disease_case", + "asylum_support_decision", + "business_finance_support_scheme", + "cma_case", + "contact", + "countryside_stewardship_grant", + "drcf_digital_markets_research", + "drug_safety_update", + "edition", + "employment_appeal_tribunal_decision", + "employment_tribunal_decision", + "european_structural_investment_fund", + "export_health_certificate", + "farming_grant", + "flood_and_coastal_erosion_risk_management_research_report", + "hmrc_manual", + "hmrc_manual_section", + "international_development_fund", + "licence_transaction", + "life_saving_maritime_appliance_service_station", + "maib_report", + "manual", + "manual_section", + "marine_equipment_approved_recommendation", + "marine_notice", + "medical_safety_alert", + "person", + "policy", + "product_safety_alert_report_recall", + "protected_food_drink_name", + "raib_report", + "research_for_development_output", 
+ "residential_property_tribunal_decision", + "traffic_commissioner_regulatory_decision", + "service_manual_guide", + "service_manual_topic", + "service_standard_report", + "statutory_instrument", + "tax_tribunal_decision", + "utaac_decision", + "veterans_support_organisation" + ] +} diff --git a/elasticsearch.yml b/elasticsearch.yml index d83dcd193..5e817aedc 100644 --- a/elasticsearch.yml +++ b/elasticsearch.yml @@ -2,6 +2,7 @@ production: &default base_uri: <%= ENV["ELASTICSEARCH_URI"] || 'http://localhost:9200' %> content_index_names: ["detailed", "government"] govuk_index_name: "govuk" + specialist_finder_index_name: "specialist-finder" auxiliary_index_names: ["page-traffic", "metasearch"] registry_index: "government" metasearch_index_name: "metasearch" @@ -23,6 +24,7 @@ test: base_uri: <%= ENV.fetch('ELASTICSEARCH_URI', 'http://localhost:9200') %> content_index_names: ["government_test"] govuk_index_name: "govuk_test" + specialist_finder_index_name: "specialist-finder_test" auxiliary_index_names: ["page-traffic_test", "metasearch_test"] registry_index: "government_test" metasearch_index_name: "metasearch_test" diff --git a/lib/index/elasticsearch_processor.rb b/lib/index/elasticsearch_processor.rb index 0183231ba..e0ef36f72 100644 --- a/lib/index/elasticsearch_processor.rb +++ b/lib/index/elasticsearch_processor.rb @@ -8,6 +8,10 @@ def self.govuk new(client: GovukIndex::Client) end + def self.specialist_finder + new(client: SpecialistFinderIndex::Client) + end + def initialize(client:) @client = client @actions = [] diff --git a/lib/search_config.rb b/lib/search_config.rb index d2828663d..ebe2e65f4 100644 --- a/lib/search_config.rb +++ b/lib/search_config.rb @@ -10,6 +10,7 @@ class << self content_index_names spelling_index_names govuk_index_name + specialist_finder_index_name page_traffic_index_name ].each do |config_method| define_method config_method do diff --git a/lib/specialist_finder_index/client.rb b/lib/specialist_finder_index/client.rb index 
c77965409..af2df0464 100644 --- a/lib/specialist_finder_index/client.rb +++ b/lib/specialist_finder_index/client.rb @@ -4,7 +4,7 @@ class Client < Index::Client def index_name # rubocop:disable Naming/MemoizedInstanceVariableName - @_index ||= SearchConfig.govuk_index_name + @_index ||= SearchConfig.specialist_finder_index_name # rubocop:enable Naming/MemoizedInstanceVariableName end end diff --git a/lib/specialist_finder_index/document_type_mapper.rb b/lib/specialist_finder_index/document_type_mapper.rb index a4568edd3..98ea3cd71 100644 --- a/lib/specialist_finder_index/document_type_mapper.rb +++ b/lib/specialist_finder_index/document_type_mapper.rb @@ -7,23 +7,11 @@ def initialize(payload) end def type - elasticsearch_document_type + @payload["document_type"] end def unpublishing_type? - UNPUBLISHING_TYPES.include?(payload["document_type"]) - end - - private - - attr_reader :payload - - def mapped_document_types - @mapped_document_types ||= YAML.load_file(File.join(__dir__, "../../config/govuk_index/mapped_document_types.yaml")) - end - - def elasticsearch_document_type - @elasticsearch_document_type ||= mapped_document_types[payload["document_type"]] + UNPUBLISHING_TYPES.include?(@payload["document_type"]) end end end diff --git a/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb b/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb index a15468372..002626afc 100644 --- a/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb +++ b/lib/specialist_finder_index/presenters/elasticsearch_presenter.rb @@ -12,6 +12,57 @@ def type end def document + { + attachments: common_fields.attachments, + contact_groups: details.contact_groups, + content_id: common_fields.content_id, + content_purpose_subgroup: common_fields.content_purpose_subgroup, + content_purpose_supergroup: common_fields.content_purpose_supergroup, + content_store_document_type: common_fields.content_store_document_type, + description: common_fields.description, + 
document_type: type, + email_document_supertype: common_fields.email_document_supertype, + format: common_fields.format, + government_document_supertype: common_fields.government_document_supertype, + government_name: common_fields.government_name, + hmrc_manual_section_id: common_fields.section_id, + image_url: details.image_url, + indexable_content: indexable.indexable_content, + is_historic: common_fields.historic?, + is_political: common_fields.political?, + is_withdrawn: common_fields.withdrawn?, + latest_change_note: details.latest_change_note, + licence_identifier: details.licence_identifier, + licence_short_description: details.licence_short_description, + link: common_fields.link, + mainstream_browse_page_content_ids: expanded_links.mainstream_browse_page_content_ids, + mainstream_browse_pages: expanded_links.mainstream_browse_pages, + manual: details.parent_manual, + organisation_content_ids: expanded_links.organisation_content_ids, + organisations: expanded_links.organisations, + part_of_taxonomy_tree: expanded_links.part_of_taxonomy_tree, + parts: parts.presented_parts, + people: expanded_links.people, + policy_groups: expanded_links.policy_groups, + popularity: common_fields.popularity, + popularity_b: common_fields.popularity_b, + primary_publishing_organisation: expanded_links.primary_publishing_organisation, + public_timestamp: common_fields.public_timestamp, + publishing_app: common_fields.publishing_app, + rendering_app: common_fields.rendering_app, + role_appointments: expanded_links.role_appointments, + roles: expanded_links.roles, + taxons: expanded_links.taxons, + title: common_fields.title, + topical_events: expanded_links.topical_events, + updated_at: common_fields.updated_at, + user_journey_document_supertype: common_fields.user_journey_document_supertype, + view_count: common_fields.view_count, + world_locations: expanded_links.world_locations, + }.merge(specialist_fields).reject { |_, v| v.nil? 
} + end + + def specialist_fields { ai_assurance_technique: specialist.ai_assurance_technique, aircraft_category: specialist.aircraft_category, @@ -30,7 +81,6 @@ def document areas_of_interest: specialist.areas_of_interest, assessment_date: specialist.assessment_date, assurance_technique_approach: specialist.assurance_technique_approach, - attachments: common_fields.attachments, authors: specialist.authors, business_sizes: specialist.business_sizes, business_stages: specialist.business_stages, @@ -42,11 +92,6 @@ def document closed_date: specialist.closed_date, closing_date: specialist.closing_date, commodity_type: specialist.commodity_type, - contact_groups: details.contact_groups, - content_id: common_fields.content_id, - content_purpose_subgroup: common_fields.content_purpose_subgroup, - content_purpose_supergroup: common_fields.content_purpose_supergroup, - content_store_document_type: common_fields.content_store_document_type, continuation_link: specialist.continuation_link, country: specialist.country, country_of_origin: specialist.country_of_origin, @@ -57,7 +102,6 @@ def document date_registration: specialist.date_registration, date_registration_eu: specialist.date_registration_eu, decision_subject: specialist.decision_subject, - description: common_fields.description, destination_country: specialist.destination_country, development_sector: specialist.development_sector, digital_market_research_area: specialist.digital_market_research_area, @@ -68,67 +112,40 @@ def document disease_case_closed_date: specialist.disease_case_closed_date, disease_case_opened_date: specialist.disease_case_opened_date, disease_type: specialist.disease_type, - document_type: type, eligible_entities: specialist.eligible_entities, - email_document_supertype: common_fields.email_document_supertype, first_published_at: specialist.first_published_at, flood_and_coastal_erosion_category: specialist.flood_and_coastal_erosion_category, - format: common_fields.format, fund_state: 
specialist.fund_state, fund_type: specialist.fund_type, funding_amount: specialist.funding_amount, funding_source: specialist.funding_source, - government_document_supertype: common_fields.government_document_supertype, - government_name: common_fields.government_name, grant_type: specialist.grant_type, hidden_indexable_content: specialist.hidden_indexable_content, - hmrc_manual_section_id: common_fields.section_id, - image_url:, - indexable_content: indexable.indexable_content, industries: specialist.industries, internal_notes: specialist.internal_notes, - is_historic: common_fields.historic?, - is_political: common_fields.political?, - is_withdrawn: common_fields.withdrawn?, issued_date: specialist.issued_date, keyword: specialist.keyword, key_function: specialist.key_function, laid_date: specialist.laid_date, land_use: specialist.land_use, land_types: specialist.land_types, - latest_change_note: details.latest_change_note, - licence_identifier: details.licence_identifier, licence_transaction_continuation_link: specialist.licence_transaction_continuation_link, licence_transaction_industry: specialist.licence_transaction_industry, licence_transaction_licence_identifier: specialist.licence_transaction_licence_identifier, licence_transaction_location: specialist.licence_transaction_location, licence_transaction_will_continue_on: specialist.licence_transaction_will_continue_on, - licence_short_description: details.licence_short_description, life_saving_maritime_appliance_service_station_regions: specialist.life_saving_maritime_appliance_service_station_regions, life_saving_maritime_appliance_type: specialist.life_saving_maritime_appliance_type, life_saving_maritime_appliance_manufacturer: specialist.life_saving_maritime_appliance_manufacturer, - link: common_fields.link, location: specialist.location, - mainstream_browse_page_content_ids: expanded_links.mainstream_browse_page_content_ids, - mainstream_browse_pages: expanded_links.mainstream_browse_pages, - manual: 
details.parent_manual, marine_notice_topic: specialist.marine_notice_topic, marine_notice_type: specialist.marine_notice_type, marine_notice_vessel_type: specialist.marine_notice_vessel_type, market_sector: specialist.market_sector, medical_specialism: specialist.medical_specialism, opened_date: specialist.opened_date, - organisation_content_ids: expanded_links.organisation_content_ids, - organisations: expanded_links.organisations, outcome_type: specialist.outcome_type, - part_of_taxonomy_tree: expanded_links.part_of_taxonomy_tree, - parts: parts.presented_parts, payment_types: specialist.payment_types, - people: expanded_links.people, - policy_groups: expanded_links.policy_groups, - popularity: common_fields.popularity, - popularity_b: common_fields.popularity_b, - primary_publishing_organisation: expanded_links.primary_publishing_organisation, principle: specialist.principle, product_alert_type: specialist.product_alert_type, product_category: specialist.product_category, @@ -138,8 +155,6 @@ def document project_code: specialist.project_code, project_status: specialist.project_status, protection_type: specialist.protection_type, - public_timestamp: common_fields.public_timestamp, - publishing_app: common_fields.publishing_app, railway_type: specialist.railway_type, reason_for_protection: specialist.reason_for_protection, reference_number: specialist.reference_number, @@ -147,28 +162,21 @@ def document register: specialist.register, registered_name: specialist.registered_name, registration: specialist.registration, - rendering_app: common_fields.rendering_app, report_type: specialist.report_type, research_document_type: specialist.research_document_type, result: specialist.result, review_status: specialist.review_status, - role_appointments: expanded_links.role_appointments, - roles: expanded_links.roles, sector: specialist.sector, service_provider: specialist.service_provider, sift_end_date: specialist.sift_end_date, sifting_status: specialist.sifting_status, - 
slug:, stage: specialist.stage, status: specialist.status, subject: specialist.subject, - taxons: expanded_links.taxons, theme: specialist.theme, therapeutic_area: specialist.therapeutic_area, tiers_or_standalone_items: specialist.tiers_or_standalone_items, time_registration: specialist.time_registration, - title: common_fields.title, - topical_events: expanded_links.topical_events, topics: specialist.topics, traditional_term_grapevine_product_category: specialist.traditional_term_grapevine_product_category, traditional_term_language: specialist.traditional_term_language, @@ -183,9 +191,7 @@ def document tribunal_decision_sub_categories: specialist.tribunal_decision_sub_categories, tribunal_decision_sub_category: specialist.tribunal_decision_sub_category, types_of_support: specialist.types_of_support, - updated_at: common_fields.updated_at, use_case: specialist.use_case, - user_journey_document_supertype: common_fields.user_journey_document_supertype, value_of_funding: specialist.value_of_funding, vessel_type: specialist.vessel_type, veterans_support_organisation_health_and_social_care: specialist.veterans_support_organisation_health_and_social_care, @@ -199,15 +205,13 @@ def document veterans_support_organisation_region_northern_ireland: specialist.veterans_support_organisation_region_northern_ireland, veterans_support_organisation_region_scotland: specialist.veterans_support_organisation_region_scotland, veterans_support_organisation_region_wales: specialist.veterans_support_organisation_region_wales, - view_count: common_fields.view_count, virus_strain: specialist.virus_strain, will_continue_on: specialist.will_continue_on, withdrawn_date: specialist.withdrawn_date, - world_locations: expanded_links.world_locations, year_adopted: specialist.year_adopted, zone_restriction: specialist.zone_restriction, zone_type: specialist.zone_type, - }.reject { |_, v| v.nil? } + } end def updated_at @@ -238,10 +242,6 @@ def valid! 
end end - def image_url - details.image_url || (expanded_links.default_news_image if newslike?) - end - private attr_reader :payload @@ -254,19 +254,6 @@ def indexable ) end - def slug - case format - when "mainstream_browse_page" - base_path.gsub(%r{^/browse/}, "") - when "policy" - base_path.gsub(%r{^/government/policies/}, "") - when "person" - base_path.gsub(%r{^/government/people/}, "") - when "ministerial_role" - base_path.gsub(%r{^/government/ministers/}, "") - end - end - def common_fields @common_fields ||= GovukIndex::CommonFieldsPresenter.new(payload) end @@ -286,12 +273,5 @@ def expanded_links def specialist @specialist ||= SpecialistPresenter.new(payload) end - - def newslike? - return false if common_fields.content_store_document_type == "fatality_notice" - - common_fields.content_purpose_subgroup == "news" || - common_fields.content_purpose_subgroup == "speeches_and_statements" - end end end diff --git a/lib/specialist_finder_index/publishing_event_job.rb b/lib/specialist_finder_index/publishing_event_job.rb index ff74f90b4..2e2f26b8b 100644 --- a/lib/specialist_finder_index/publishing_event_job.rb +++ b/lib/specialist_finder_index/publishing_event_job.rb @@ -15,36 +15,11 @@ class NotIdentifiable < StandardError; end class MissingExternalUrl < StandardError; end - DOCUMENT_TYPES_WITHOUT_BASE_PATH = - %w[ - contact - role_appointment - world_location - - # - role - document - types - ambassador_role - board_member_role - chief_professional_officer_role - chief_scientific_officer_role - chief_scientific_advisor_role - deputy_head_of_mission_role - governor_role - high_commissioner_role - military_role - ministerial_role - special_representative_role - traffic_commissioner_role - worldwide_office_staff_role - ].freeze - class PublishingEventJob < BaseJob notify_of_failures def perform(messages) - processor = Index::ElasticsearchProcessor.govuk + processor = Index::ElasticsearchProcessor.specialist_finder messages.each do |routing_key, payload| 
process_action(processor, routing_key, payload) @@ -57,15 +32,19 @@ def perform(messages) end # Rescuing exception to guarantee we capture all Sidekiq retries rescue Exception # rubocop:disable Lint/RescueException - Services.statsd_client.increment("govuk_index.sidekiq-retry") + Services.statsd_client.increment("specialist_finder_index.sidekiq-retry") raise end private + NON_INDEXED_PAGES = %w[ + finder_email_signup + ].freeze + def process_action(processor, routing_key, payload) logger.debug("Processing #{routing_key}: #{payload}") - Services.statsd_client.increment("govuk_index.sidekiq-consumed") + Services.statsd_client.increment("specialist_finder_index.sidekiq-consumed") type_mapper = DocumentTypeMapper.new(payload) @@ -82,38 +61,34 @@ def process_action(processor, routing_key, payload) identifier = "#{presenter.link} #{presenter.type || "'unmapped type'"}" - if type_mapper.unpublishing_type? + if NON_INDEXED_PAGES.include? type_mapper.type + logger.info("#{routing_key} -> IGNORE #{identifier}") + elsif type_mapper.unpublishing_type? 
logger.info("#{routing_key} -> DELETE #{identifier}") processor.delete(presenter) - elsif payload.fetch("locale", "en") != "en" || GovukIndex::MigratedFormats.non_indexable?(presenter.format, presenter.base_path, presenter.publishing_app) - logger.info("#{routing_key} -> BLOCKLISTED #{identifier}") - elsif GovukIndex::MigratedFormats.indexable?(presenter.format, presenter.base_path, presenter.publishing_app) + else logger.info("#{routing_key} -> INDEX #{identifier}") processor.save(presenter) - else - logger.info("#{routing_key} -> UNKNOWN #{identifier}") end # Rescuing as we don't want to retry this class of error rescue NotIdentifiable => e - return if DOCUMENT_TYPES_WITHOUT_BASE_PATH.include?(payload["document_type"]) - GovukError.notify(e, extra: { message_body: payload }) # Unpublishing messages for something that does not exist may have been # processed out of order so we don't want to notify errbit but just allow # the process to continue rescue NotFoundError logger.info("#{payload['base_path']} could not be found.") - Services.statsd_client.increment("govuk_index.not-found-error") + Services.statsd_client.increment("specialist_finder_index.not-found-error") rescue UnknownDocumentTypeError logger.info("#{payload['document_type']} document type is not known.") - Services.statsd_client.increment("govuk_index.unknown-document-type") + Services.statsd_client.increment("specialist_finder_index.unknown-document-type") end def process_response(response, messages) messages_with_error = [] if response["items"].count > 1 - Services.statsd_client.increment("govuk_index.elasticsearch.multiple_responses") + Services.statsd_client.increment("specialist_finder_index.elasticsearch.multiple_responses") end if response["items"].count != messages.count @@ -121,7 +96,7 @@ def process_response(response, messages) end response["items"].zip(messages).each do |response_for_message, message| - messages_with_error << message unless Index::ResponseValidator.new(namespace: 
"govuk_index").valid?(response_for_message) + messages_with_error << message unless Index::ResponseValidator.new(namespace: "specialist_finder_index").valid?(response_for_message) end if messages_with_error.any? diff --git a/lib/specialist_finder_index/publishing_event_processor.rb b/lib/specialist_finder_index/publishing_event_processor.rb index 73f3fb385..6e21c26d7 100644 --- a/lib/specialist_finder_index/publishing_event_processor.rb +++ b/lib/specialist_finder_index/publishing_event_processor.rb @@ -3,7 +3,7 @@ class PublishingEventProcessor def process(messages) messages = Array(messages) # treat a single message as an array with one value - Services.statsd_client.increment("govuk_index.rabbit-mq-consumed") + Services.statsd_client.increment("specialist_finder_index.rabbit-mq-consumed") PublishingEventJob.perform_async(messages.map { |msg| [msg.delivery_info[:routing_key], msg.payload] }) messages.each(&:ack) end From 0513977df1e9a9e285885f6b616c20e37c440142 Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Tue, 15 Oct 2024 16:42:37 +0100 Subject: [PATCH 3/8] Switching specialist finder tests over to use specialist finder index The document type is no longer mapped from publishing API, so we also shouldn't need to specifically map the format. 
--- .../presenters/common_fields_presenter.rb | 1 - lib/rummager.rb | 7 ++++++- lib/rummager/app.rb | 2 +- .../specialist_formats_spec.rb | 19 +++++++------------ spec/support/index_helpers.rb | 4 ++-- .../specialist_formats_spec.rb | 4 ++-- 6 files changed, 18 insertions(+), 19 deletions(-) rename spec/integration/{govuk_index => specialist_finder_index}/specialist_formats_spec.rb (78%) rename spec/unit/{govuk_index => specialist_finder_index}/specialist_formats_spec.rb (98%) diff --git a/lib/govuk_index/presenters/common_fields_presenter.rb b/lib/govuk_index/presenters/common_fields_presenter.rb index f532ab6e0..ede465646 100644 --- a/lib/govuk_index/presenters/common_fields_presenter.rb +++ b/lib/govuk_index/presenters/common_fields_presenter.rb @@ -1,7 +1,6 @@ module GovukIndex class CommonFieldsPresenter CUSTOM_FORMAT_MAP = { - "esi_fund" => "european_structural_investment_fund", "external_content" => "recommended-link", "service_manual_homepage" => "service_manual_guide", "service_manual_service_standard" => "service_manual_guide", diff --git a/lib/rummager.rb b/lib/rummager.rb index 9f347fef3..a62d52955 100644 --- a/lib/rummager.rb +++ b/lib/rummager.rb @@ -87,7 +87,9 @@ require "govuk_index/updater" require "govuk_index/client" +require "specialist_finder_index/client" require "govuk_index/document_type_mapper" +require "specialist_finder_index/document_type_mapper" require "govuk_index/page_traffic_job" require "govuk_index/method_builder" require "govuk_index/indexable_content_sanitiser" @@ -101,12 +103,15 @@ require "govuk_index/presenters/elasticsearch_identity" require "govuk_index/presenters/elasticsearch_delete_presenter" require "govuk_index/presenters/elasticsearch_presenter" +require "specialist_finder_index/presenters/elasticsearch_presenter" require "govuk_index/presenters/expanded_links_presenter" require "govuk_index/presenters/indexable_content_presenter" require "govuk_index/presenters/parts_presenter" -require 
"govuk_index/presenters/specialist_presenter" +require "specialist_finder_index/presenters/specialist_presenter" require "govuk_index/publishing_event_processor" +require "specialist_finder_index/publishing_event_processor" require "govuk_index/publishing_event_job" +require "specialist_finder_index/publishing_event_job" require "govuk_index/supertype_updater" require "govuk_index/supertype_job" require "govuk_message_queue_consumer" diff --git a/lib/rummager/app.rb b/lib/rummager/app.rb index f3cbecf82..1b04dfb01 100644 --- a/lib/rummager/app.rb +++ b/lib/rummager/app.rb @@ -66,7 +66,7 @@ def require_authentication(permission) end def prevent_access_to_govuk - if index_name == "govuk" + if %w[govuk specialist-finder].include?(index_name) halt(403, "Actions to govuk index are not allowed via this endpoint, please use the message queue to update this index") end end diff --git a/spec/integration/govuk_index/specialist_formats_spec.rb b/spec/integration/specialist_finder_index/specialist_formats_spec.rb similarity index 78% rename from spec/integration/govuk_index/specialist_formats_spec.rb rename to spec/integration/specialist_finder_index/specialist_formats_spec.rb index e59e63ad7..8738edd0c 100644 --- a/spec/integration/govuk_index/specialist_formats_spec.rb +++ b/spec/integration/specialist_finder_index/specialist_formats_spec.rb @@ -7,7 +7,7 @@ consumer = GovukMessageQueueConsumer::Consumer.new( queue_name: "bigwig.test", - processor: GovukIndex::PublishingEventProcessor.new, + processor: SpecialistFinderIndex::PublishingEventProcessor.new, rabbitmq_connection: bunny_mock, ) @@ -21,11 +21,9 @@ payload: { document_type: "finder" }, ) - allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return("finder" => :all) - @queue.publish(random_example.to_json, content_type: "application/json") - expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "govuk_test", type: "edition") + expect_document_is_in_rummager({ "link" => 
random_example["base_path"] }, index: "specialist-finder_test", type: "finder") end it "specialist documents are correctly indexed" do @@ -60,30 +58,27 @@ schema: "specialist_document", payload: { document_type: specialist_document_type }, ) - allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return(specialist_document_type => :all) @queue.publish(random_example.to_json, content_type: "application/json") - expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "govuk_test", type: specialist_document_type) + expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "specialist-finder_test", type: specialist_document_type) end end it "esi documents are correctly indexed" do publisher_document_type = "esi_fund" - search_document_type = "european_structural_investment_fund" random_example = generate_random_example( schema: "specialist_document", payload: { document_type: publisher_document_type }, ) - allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return(search_document_type => :all) @queue.publish(random_example.to_json, content_type: "application/json") expect_document_is_in_rummager( - { "link" => random_example["base_path"], "format" => search_document_type }, - index: "govuk_test", - type: search_document_type, + { "link" => random_example["base_path"], "format" => publisher_document_type }, + index: "specialist-finder_test", + type: publisher_document_type, ) end @@ -96,7 +91,7 @@ @queue.publish(random_example.to_json, content_type: "application/json") expect { - fetch_document_from_rummager(id: random_example["base_path"], index: "govuk_test") + fetch_document_from_rummager(id: random_example["base_path"], index: "specialist-finder_test") }.to raise_error(Elasticsearch::Transport::Transport::Errors::NotFound) end end diff --git a/spec/support/index_helpers.rb b/spec/support/index_helpers.rb index 35459a3d2..d5657e232 100644 --- a/spec/support/index_helpers.rb +++ 
b/spec/support/index_helpers.rb @@ -13,7 +13,7 @@ def self.all_index_names end def self.clean_all - all_index_names.each do |index_name| + all_index_names.append(SearchConfig.specialist_finder_index_name).each do |index_name| clean_index_group(index_name) end end @@ -36,7 +36,7 @@ def self.clean_index_group(index_name) end def self.create_all - all_index_names.each do |index| + all_index_names.append(SearchConfig.specialist_finder_index_name).each do |index| create_test_index(index) end end diff --git a/spec/unit/govuk_index/specialist_formats_spec.rb b/spec/unit/specialist_finder_index/specialist_formats_spec.rb similarity index 98% rename from spec/unit/govuk_index/specialist_formats_spec.rb rename to spec/unit/specialist_finder_index/specialist_formats_spec.rb index 4b9336f19..2d05099df 100644 --- a/spec/unit/govuk_index/specialist_formats_spec.rb +++ b/spec/unit/specialist_finder_index/specialist_formats_spec.rb @@ -1,6 +1,6 @@ require "spec_helper" -RSpec.describe GovukIndex::ElasticsearchPresenter, "Specialist formats" do +RSpec.describe SpecialistFinderIndex::ElasticsearchPresenter, "Specialist formats" do before do allow_any_instance_of(Indexer::PopularityLookup).to receive(:lookup_popularities).and_return({}) end @@ -256,7 +256,7 @@ def build_example_with_metadata(metadata) payload end - type_mapper = GovukIndex::DocumentTypeMapper.new(example) + type_mapper = SpecialistFinderIndex::DocumentTypeMapper.new(example) described_class.new(payload: example, type_mapper:).document end From 0304f08a4033953d3ae79c6eadc37df4c56642d5 Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Tue, 15 Oct 2024 16:44:23 +0100 Subject: [PATCH 4/8] Removing specialist finder specific fields from Govuk Index presenters --- .../presenters/elasticsearch_presenter.rb | 151 ---------------- .../presenters/specialist_presenter.rb | 167 ------------------ 2 files changed, 318 deletions(-) delete mode 100644 lib/govuk_index/presenters/specialist_presenter.rb diff --git 
a/lib/govuk_index/presenters/elasticsearch_presenter.rb b/lib/govuk_index/presenters/elasticsearch_presenter.rb index 40b315684..b1446abbc 100644 --- a/lib/govuk_index/presenters/elasticsearch_presenter.rb +++ b/lib/govuk_index/presenters/elasticsearch_presenter.rb @@ -13,200 +13,53 @@ def type def document { - ai_assurance_technique: specialist.ai_assurance_technique, - aircraft_category: specialist.aircraft_category, - aircraft_type: specialist.aircraft_type, - alert_type: specialist.alert_type, - algorithmic_transparency_record_atrs_version: specialist.algorithmic_transparency_record_atrs_version, - algorithmic_transparency_record_capability: specialist.algorithmic_transparency_record_capability, - algorithmic_transparency_record_date_published: specialist.algorithmic_transparency_record_date_published, - algorithmic_transparency_record_function: specialist.algorithmic_transparency_record_function, - algorithmic_transparency_record_organisation: specialist.algorithmic_transparency_record_organisation, - algorithmic_transparency_record_organisation_type: specialist.algorithmic_transparency_record_organisation_type, - algorithmic_transparency_record_other_tags: specialist.algorithmic_transparency_record_other_tags, - algorithmic_transparency_record_phase: specialist.algorithmic_transparency_record_phase, - algorithmic_transparency_record_region: specialist.algorithmic_transparency_record_region, - algorithmic_transparency_record_task: specialist.algorithmic_transparency_record_task, - areas_of_interest: specialist.areas_of_interest, - assessment_date: specialist.assessment_date, - assurance_technique_approach: specialist.assurance_technique_approach, attachments: common_fields.attachments, - authors: specialist.authors, - business_sizes: specialist.business_sizes, - business_stages: specialist.business_stages, - case_state: specialist.case_state, - case_type: specialist.case_type, - category: specialist.category, - certificate_status: 
specialist.certificate_status, - class_category: specialist.class_category, - closed_date: specialist.closed_date, - closing_date: specialist.closing_date, - commodity_type: specialist.commodity_type, contact_groups: details.contact_groups, content_id: common_fields.content_id, content_purpose_subgroup: common_fields.content_purpose_subgroup, content_purpose_supergroup: common_fields.content_purpose_supergroup, content_store_document_type: common_fields.content_store_document_type, - continuation_link: specialist.continuation_link, - country: specialist.country, - country_of_origin: specialist.country_of_origin, - date_application: specialist.date_application, - date_of_completion: specialist.date_of_completion, - date_of_occurrence: specialist.date_of_occurrence, - date_of_start: specialist.date_of_start, - date_registration: specialist.date_registration, - date_registration_eu: specialist.date_registration_eu, - decision_subject: specialist.decision_subject, description: common_fields.description, - destination_country: specialist.destination_country, - development_sector: specialist.development_sector, - digital_market_research_area: specialist.digital_market_research_area, - digital_market_research_category: specialist.digital_market_research_category, - digital_market_research_publish_date: specialist.digital_market_research_publish_date, - digital_market_research_publisher: specialist.digital_market_research_publisher, - digital_market_research_topic: specialist.digital_market_research_topic, - disease_case_closed_date: specialist.disease_case_closed_date, - disease_case_opened_date: specialist.disease_case_opened_date, - disease_type: specialist.disease_type, document_type: type, - eligible_entities: specialist.eligible_entities, email_document_supertype: common_fields.email_document_supertype, - first_published_at: specialist.first_published_at, - flood_and_coastal_erosion_category: specialist.flood_and_coastal_erosion_category, format: 
common_fields.format, - fund_state: specialist.fund_state, - fund_type: specialist.fund_type, - funding_amount: specialist.funding_amount, - funding_source: specialist.funding_source, government_document_supertype: common_fields.government_document_supertype, government_name: common_fields.government_name, - grant_type: specialist.grant_type, - hidden_indexable_content: specialist.hidden_indexable_content, hmrc_manual_section_id: common_fields.section_id, image_url:, indexable_content: indexable.indexable_content, - industries: specialist.industries, - internal_notes: specialist.internal_notes, is_historic: common_fields.historic?, is_political: common_fields.political?, is_withdrawn: common_fields.withdrawn?, - issued_date: specialist.issued_date, - keyword: specialist.keyword, - key_function: specialist.key_function, - laid_date: specialist.laid_date, - land_use: specialist.land_use, - land_types: specialist.land_types, latest_change_note: details.latest_change_note, licence_identifier: details.licence_identifier, - licence_transaction_continuation_link: specialist.licence_transaction_continuation_link, - licence_transaction_industry: specialist.licence_transaction_industry, - licence_transaction_licence_identifier: specialist.licence_transaction_licence_identifier, - licence_transaction_location: specialist.licence_transaction_location, - licence_transaction_will_continue_on: specialist.licence_transaction_will_continue_on, licence_short_description: details.licence_short_description, - life_saving_maritime_appliance_service_station_regions: specialist.life_saving_maritime_appliance_service_station_regions, - life_saving_maritime_appliance_type: specialist.life_saving_maritime_appliance_type, - life_saving_maritime_appliance_manufacturer: specialist.life_saving_maritime_appliance_manufacturer, link: common_fields.link, - location: specialist.location, mainstream_browse_page_content_ids: expanded_links.mainstream_browse_page_content_ids, mainstream_browse_pages: 
expanded_links.mainstream_browse_pages, manual: details.parent_manual, - marine_notice_topic: specialist.marine_notice_topic, - marine_notice_type: specialist.marine_notice_type, - marine_notice_vessel_type: specialist.marine_notice_vessel_type, - market_sector: specialist.market_sector, - medical_specialism: specialist.medical_specialism, - opened_date: specialist.opened_date, organisation_content_ids: expanded_links.organisation_content_ids, organisations: expanded_links.organisations, - outcome_type: specialist.outcome_type, part_of_taxonomy_tree: expanded_links.part_of_taxonomy_tree, parts: parts.presented_parts, - payment_types: specialist.payment_types, people: expanded_links.people, policy_groups: expanded_links.policy_groups, popularity: common_fields.popularity, popularity_b: common_fields.popularity_b, primary_publishing_organisation: expanded_links.primary_publishing_organisation, - principle: specialist.principle, - product_alert_type: specialist.product_alert_type, - product_category: specialist.product_category, - product_measure_type: specialist.product_measure_type, - product_recall_alert_date: specialist.product_recall_alert_date, - product_risk_level: specialist.product_risk_level, - project_code: specialist.project_code, - project_status: specialist.project_status, - protection_type: specialist.protection_type, public_timestamp: common_fields.public_timestamp, publishing_app: common_fields.publishing_app, - railway_type: specialist.railway_type, - reason_for_protection: specialist.reason_for_protection, - reference_number: specialist.reference_number, - regions: specialist.regions, - register: specialist.register, - registered_name: specialist.registered_name, - registration: specialist.registration, rendering_app: common_fields.rendering_app, - report_type: specialist.report_type, - research_document_type: specialist.research_document_type, - result: specialist.result, - review_status: specialist.review_status, role_appointments: 
expanded_links.role_appointments, roles: expanded_links.roles, - sector: specialist.sector, - service_provider: specialist.service_provider, - sift_end_date: specialist.sift_end_date, - sifting_status: specialist.sifting_status, slug:, - stage: specialist.stage, - status: specialist.status, - subject: specialist.subject, taxons: expanded_links.taxons, - theme: specialist.theme, - therapeutic_area: specialist.therapeutic_area, - tiers_or_standalone_items: specialist.tiers_or_standalone_items, - time_registration: specialist.time_registration, title: common_fields.title, topical_events: expanded_links.topical_events, - topics: specialist.topics, - traditional_term_grapevine_product_category: specialist.traditional_term_grapevine_product_category, - traditional_term_language: specialist.traditional_term_language, - traditional_term_type: specialist.traditional_term_type, - tribunal_decision_categories: specialist.tribunal_decision_categories, - tribunal_decision_category: specialist.tribunal_decision_category, - tribunal_decision_country: specialist.tribunal_decision_country, - tribunal_decision_decision_date: specialist.tribunal_decision_decision_date, - tribunal_decision_judges: specialist.tribunal_decision_judges, - tribunal_decision_landmark: specialist.tribunal_decision_landmark, - tribunal_decision_reference_number: specialist.tribunal_decision_reference_number, - tribunal_decision_sub_categories: specialist.tribunal_decision_sub_categories, - tribunal_decision_sub_category: specialist.tribunal_decision_sub_category, - types_of_support: specialist.types_of_support, updated_at: common_fields.updated_at, - use_case: specialist.use_case, user_journey_document_supertype: common_fields.user_journey_document_supertype, - value_of_funding: specialist.value_of_funding, - vessel_type: specialist.vessel_type, - veterans_support_organisation_health_and_social_care: specialist.veterans_support_organisation_health_and_social_care, - veterans_support_organisation_finance: 
specialist.veterans_support_organisation_finance, - veterans_support_organisation_legal_and_justice: specialist.veterans_support_organisation_legal_and_justice, - veterans_support_organisation_employment_education_and_training: specialist.veterans_support_organisation_employment_education_and_training, - veterans_support_organisation_housing: specialist.veterans_support_organisation_housing, - veterans_support_organisation_families_and_children: specialist.veterans_support_organisation_families_and_children, - veterans_support_organisation_community_and_social: specialist.veterans_support_organisation_community_and_social, - veterans_support_organisation_region_england: specialist.veterans_support_organisation_region_england, - veterans_support_organisation_region_northern_ireland: specialist.veterans_support_organisation_region_northern_ireland, - veterans_support_organisation_region_scotland: specialist.veterans_support_organisation_region_scotland, - veterans_support_organisation_region_wales: specialist.veterans_support_organisation_region_wales, view_count: common_fields.view_count, - virus_strain: specialist.virus_strain, - will_continue_on: specialist.will_continue_on, - withdrawn_date: specialist.withdrawn_date, world_locations: expanded_links.world_locations, - year_adopted: specialist.year_adopted, - zone_restriction: specialist.zone_restriction, - zone_type: specialist.zone_type, }.reject { |_, v| v.nil? } end @@ -283,10 +136,6 @@ def expanded_links @expanded_links ||= ExpandedLinksPresenter.new(payload["expanded_links"]) end - def specialist - @specialist ||= SpecialistPresenter.new(payload) - end - def newslike? 
return false if common_fields.content_store_document_type == "fatality_notice" diff --git a/lib/govuk_index/presenters/specialist_presenter.rb b/lib/govuk_index/presenters/specialist_presenter.rb deleted file mode 100644 index 415a1411a..000000000 --- a/lib/govuk_index/presenters/specialist_presenter.rb +++ /dev/null @@ -1,167 +0,0 @@ -module GovukIndex - class SpecialistPresenter - extend MethodBuilder - - set_payload_method :metadata - - delegate_to_payload :ai_assurance_technique, convert_to_array: true - delegate_to_payload :aircraft_category - delegate_to_payload :aircraft_type - delegate_to_payload :alert_type, convert_to_array: true - delegate_to_payload :algorithmic_transparency_record_atrs_version - delegate_to_payload :algorithmic_transparency_record_capability, convert_to_array: true - delegate_to_payload :algorithmic_transparency_record_date_published - delegate_to_payload :algorithmic_transparency_record_function, convert_to_array: true - delegate_to_payload :algorithmic_transparency_record_organisation - delegate_to_payload :algorithmic_transparency_record_organisation_type, convert_to_array: true - delegate_to_payload :algorithmic_transparency_record_other_tags - delegate_to_payload :algorithmic_transparency_record_phase - delegate_to_payload :algorithmic_transparency_record_region, convert_to_array: true - delegate_to_payload :algorithmic_transparency_record_task - delegate_to_payload :areas_of_interest - delegate_to_payload :assessment_date - delegate_to_payload :assurance_technique_approach, convert_to_array: true - delegate_to_payload :authors - delegate_to_payload :business_sizes - delegate_to_payload :business_stages - delegate_to_payload :category, convert_to_array: true - delegate_to_payload :case_state, convert_to_array: true - delegate_to_payload :case_type, convert_to_array: true - delegate_to_payload :certificate_status - delegate_to_payload :class_category - delegate_to_payload :closed_date - delegate_to_payload :closing_date - 
delegate_to_payload :commodity_type - delegate_to_payload :continuation_link - delegate_to_payload :country - delegate_to_payload :country_of_origin - delegate_to_payload :date_application - delegate_to_payload :date_of_completion - delegate_to_payload :date_of_occurrence - delegate_to_payload :date_of_start - delegate_to_payload :date_registration - delegate_to_payload :date_registration_eu - delegate_to_payload :decision_subject - delegate_to_payload :destination_country, convert_to_array: true - delegate_to_payload :development_sector - delegate_to_payload :digital_market_research_area, convert_to_array: true - delegate_to_payload :digital_market_research_category - delegate_to_payload :digital_market_research_publish_date - delegate_to_payload :digital_market_research_publisher, convert_to_array: true - delegate_to_payload :digital_market_research_topic, convert_to_array: true - delegate_to_payload :disease_case_closed_date - delegate_to_payload :disease_case_opened_date - delegate_to_payload :disease_type, convert_to_array: true - delegate_to_payload :eligible_entities - delegate_to_payload :flood_and_coastal_erosion_category - delegate_to_payload :fund_state, convert_to_array: true - delegate_to_payload :fund_type - delegate_to_payload :funding_amount - delegate_to_payload :funding_source - delegate_to_payload :grant_type, convert_to_array: true - delegate_to_payload :hidden_indexable_content - delegate_to_payload :industries - delegate_to_payload :internal_notes - delegate_to_payload :issued_date - delegate_to_payload :key_function, convert_to_array: true - delegate_to_payload :keyword - delegate_to_payload :laid_date - delegate_to_payload :land_types - delegate_to_payload :land_use - delegate_to_payload :licence_transaction_continuation_link - delegate_to_payload :licence_transaction_industry, convert_to_array: true - delegate_to_payload :licence_transaction_licence_identifier - delegate_to_payload :licence_transaction_location, convert_to_array: true - 
delegate_to_payload :licence_transaction_will_continue_on - delegate_to_payload :life_saving_maritime_appliance_service_station_regions, convert_to_array: true - delegate_to_payload :life_saving_maritime_appliance_type, convert_to_array: true - delegate_to_payload :life_saving_maritime_appliance_manufacturer, convert_to_array: true - delegate_to_payload :location, convert_to_array: true - delegate_to_payload :marine_notice_topic - delegate_to_payload :marine_notice_type - delegate_to_payload :marine_notice_vessel_type - delegate_to_payload :market_sector - delegate_to_payload :medical_specialism - delegate_to_payload :opened_date - delegate_to_payload :outcome_type - delegate_to_payload :payment_types - delegate_to_payload :principle, convert_to_array: true - delegate_to_payload :product_alert_type - delegate_to_payload :product_category - delegate_to_payload :product_measure_type - delegate_to_payload :product_recall_alert_date - delegate_to_payload :product_risk_level - delegate_to_payload :project_code - delegate_to_payload :project_status - delegate_to_payload :protection_type - delegate_to_payload :railway_type - delegate_to_payload :reason_for_protection - delegate_to_payload :reference_number - delegate_to_payload :regions - delegate_to_payload :register - delegate_to_payload :registered_name - delegate_to_payload :registration - delegate_to_payload :report_type, convert_to_array: true - delegate_to_payload :research_document_type - delegate_to_payload :result - delegate_to_payload :review_status - delegate_to_payload :sector, convert_to_array: true - delegate_to_payload :service_provider - delegate_to_payload :sift_end_date - delegate_to_payload :sifting_status - delegate_to_payload :stage - delegate_to_payload :status - delegate_to_payload :subject - delegate_to_payload :theme - delegate_to_payload :therapeutic_area - delegate_to_payload :tiers_or_standalone_items - delegate_to_payload :time_registration - delegate_to_payload :topics - delegate_to_payload 
:traditional_term_grapevine_product_category - delegate_to_payload :traditional_term_language - delegate_to_payload :traditional_term_type - delegate_to_payload :tribunal_decision_categories - delegate_to_payload :tribunal_decision_category - delegate_to_payload :tribunal_decision_country - delegate_to_payload :tribunal_decision_decision_date - delegate_to_payload :tribunal_decision_judges - delegate_to_payload :tribunal_decision_landmark - delegate_to_payload :tribunal_decision_reference_number - delegate_to_payload :tribunal_decision_sub_categories - delegate_to_payload :tribunal_decision_sub_category - delegate_to_payload :types_of_support - delegate_to_payload :use_case, convert_to_array: true - delegate_to_payload :value_of_funding - delegate_to_payload :vessel_type - delegate_to_payload :veterans_support_organisation_health_and_social_care - delegate_to_payload :veterans_support_organisation_finance - delegate_to_payload :veterans_support_organisation_legal_and_justice - delegate_to_payload :veterans_support_organisation_employment_education_and_training - delegate_to_payload :veterans_support_organisation_housing - delegate_to_payload :veterans_support_organisation_families_and_children - delegate_to_payload :veterans_support_organisation_community_and_social - delegate_to_payload :veterans_support_organisation_region_england - delegate_to_payload :veterans_support_organisation_region_northern_ireland - delegate_to_payload :veterans_support_organisation_region_scotland - delegate_to_payload :veterans_support_organisation_region_wales - delegate_to_payload :virus_strain - delegate_to_payload :will_continue_on - delegate_to_payload :withdrawn_date - delegate_to_payload :year_adopted - delegate_to_payload :zone_restriction - delegate_to_payload :zone_type, convert_to_array: true - - def initialize(payload) - @payload = payload - @metadata = @payload.dig("details", "metadata") || {} - end - - def first_published_at - metadata["first_published_at"] || 
@payload["first_published_at"] - end - - private - - attr_reader :metadata - end -end From 5da63ebf098950d590d50adc45567ceb25ed0764 Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Wed, 16 Oct 2024 09:22:18 +0100 Subject: [PATCH 5/8] Add queue listeners for specialist documents and finders To direct it to the Specialist Finder index. Since we are only setting up to listen to specialist documents and finders, we no longer have to specifically ignore finder email signups. --- Procfile | 1 + .../publishing_event_job.rb | 8 +---- lib/tasks/message_queue.rake | 14 +++++++++ .../specialist_formats_spec.rb | 31 +------------------ 4 files changed, 17 insertions(+), 37 deletions(-) diff --git a/Procfile b/Procfile index a577ec046..809d483fa 100644 --- a/Procfile +++ b/Procfile @@ -2,4 +2,5 @@ web: bundle exec unicorn -c ./config/unicorn.rb -p ${PORT:-3233} worker: bundle exec sidekiq -C ./config/sidekiq.yml publishing-queue-listener: bundle exec rake message_queue:listen_to_publishing_queue govuk-index-queue-listener: bundle exec rake message_queue:insert_data_into_govuk +specialist-finder-index-queue-listener: bundle exec rake message_queue:insert_data_into_specialist_finder bulk-reindex-queue-listener: bundle exec rake message_queue:bulk_insert_data_into_govuk diff --git a/lib/specialist_finder_index/publishing_event_job.rb b/lib/specialist_finder_index/publishing_event_job.rb index 2e2f26b8b..02bbbdd22 100644 --- a/lib/specialist_finder_index/publishing_event_job.rb +++ b/lib/specialist_finder_index/publishing_event_job.rb @@ -38,10 +38,6 @@ def perform(messages) private - NON_INDEXED_PAGES = %w[ - finder_email_signup - ].freeze - def process_action(processor, routing_key, payload) logger.debug("Processing #{routing_key}: #{payload}") Services.statsd_client.increment("specialist_finder_index.sidekiq-consumed") @@ -61,9 +57,7 @@ def process_action(processor, routing_key, payload) identifier = "#{presenter.link} #{presenter.type || "'unmapped type'"}" - if
NON_INDEXED_PAGES.include? type_mapper.type - logger.info("#{routing_key} -> IGNORE #{identifier}") - elsif type_mapper.unpublishing_type? + if type_mapper.unpublishing_type? logger.info("#{routing_key} -> DELETE #{identifier}") processor.delete(presenter) else diff --git a/lib/tasks/message_queue.rake b/lib/tasks/message_queue.rake index f2e755ebe..0743438c0 100644 --- a/lib/tasks/message_queue.rake +++ b/lib/tasks/message_queue.rake @@ -10,6 +10,8 @@ namespace :message_queue do channel.queue("search_api_to_be_indexed").bind(exch, routing_key: "*.links") channel.queue("search_api_bulk_reindex").bind(exch, routing_key: "*.bulk.reindex") channel.queue("search_api_govuk_index").bind(exch, routing_key: "*.*") + channel.queue("search_api_specialist_finder_index_documents").bind(exch, routing_key: "specialist_document.*") + channel.queue("search_api_specialist_finder_index_finders").bind(exch, routing_key: "finder.*") end desc "Index documents that are published to the publishing-api" @@ -28,6 +30,18 @@ namespace :message_queue do ).run end + desc "Gets data from RabbitMQ and insert into specialist finder index" + task :insert_data_into_specialist_finder do + GovukMessageQueueConsumer::Consumer.new( + queue_name: "search_api_specialist_finder_index_documents", + processor: SpecialistFinderIndex::PublishingEventProcessor.new, + ).run + GovukMessageQueueConsumer::Consumer.new( + queue_name: "search_api_specialist_finder_index_finders", + processor: SpecialistFinderIndex::PublishingEventProcessor.new, + ).run + end + desc "Gets data from RabbitMQ and insert into govuk index (bulk reindex queue)" task :bulk_insert_data_into_govuk do GovukMessageQueueConsumer::Consumer.new( diff --git a/spec/integration/specialist_finder_index/specialist_formats_spec.rb b/spec/integration/specialist_finder_index/specialist_formats_spec.rb index 8738edd0c..21939c780 100644 --- a/spec/integration/specialist_finder_index/specialist_formats_spec.rb +++ 
b/spec/integration/specialist_finder_index/specialist_formats_spec.rb @@ -49,6 +49,7 @@ statutory_instrument tax_tribunal_decision utaac_decision + esi_fund ] # ideally we would run a test for all document types, but this takes 3 seconds so I have limited @@ -64,34 +65,4 @@ expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "specialist-finder_test", type: specialist_document_type) end end - - it "esi documents are correctly indexed" do - publisher_document_type = "esi_fund" - - random_example = generate_random_example( - schema: "specialist_document", - payload: { document_type: publisher_document_type }, - ) - - @queue.publish(random_example.to_json, content_type: "application/json") - - expect_document_is_in_rummager( - { "link" => random_example["base_path"], "format" => publisher_document_type }, - index: "specialist-finder_test", - type: publisher_document_type, - ) - end - - it "finders email signup are never indexed" do - random_example = generate_random_example( - schema: "finder_email_signup", - payload: { document_type: "finder_email_signup" }, - ) - - @queue.publish(random_example.to_json, content_type: "application/json") - - expect { - fetch_document_from_rummager(id: random_example["base_path"], index: "specialist-finder_test") - }.to raise_error(Elasticsearch::Transport::Transport::Errors::NotFound) - end end From daa2c103626a10995bc410ab228d299e4e018a3e Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Wed, 16 Oct 2024 12:42:09 +0100 Subject: [PATCH 6/8] Add specialist finder index as part of all indices for creation Previously we thought we should exclude the specialist finder index from all indices, in case there are stats that depend on these indices and we end up double counting documents. However, it looks like the monitoring task just sends all the data to statsd, and the stats Ruby file under bin only looks at govuk anyway, not the specialist finder index.
We should be good to add the specialist finder index as part of "all indices" --- lib/search_config.rb | 2 +- spec/support/index_helpers.rb | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/search_config.rb b/lib/search_config.rb index ebe2e65f4..6ef9c1414 100644 --- a/lib/search_config.rb +++ b/lib/search_config.rb @@ -43,7 +43,7 @@ def index_names def all_index_names # this is used to process data in the rake file when `all` is passed in as previous we skipped `govuk` # we can't update index_names at this stage as it is used in multiple spots including the index filtering - content_index_names + auxiliary_index_names + [govuk_index_name] + content_index_names + auxiliary_index_names + [govuk_index_name, specialist_finder_index_name] end def run_search(raw_parameters) diff --git a/spec/support/index_helpers.rb b/spec/support/index_helpers.rb index d5657e232..9a910f2de 100644 --- a/spec/support/index_helpers.rb +++ b/spec/support/index_helpers.rb @@ -9,11 +9,11 @@ def self.setup_test_indexes end def self.all_index_names - SearchConfig.content_index_names + SearchConfig.auxiliary_index_names + [SearchConfig.govuk_index_name] + SearchConfig.content_index_names + SearchConfig.auxiliary_index_names + [SearchConfig.govuk_index_name, SearchConfig.specialist_finder_index_name] end def self.clean_all - all_index_names.append(SearchConfig.specialist_finder_index_name).each do |index_name| + all_index_names.each do |index_name| clean_index_group(index_name) end end @@ -36,7 +36,7 @@ def self.clean_index_group(index_name) end def self.create_all - all_index_names.append(SearchConfig.specialist_finder_index_name).each do |index| + all_index_names.each do |index| create_test_index(index) end end From 41a059393e88762451a884ec809d168011ee8f5e Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Wed, 30 Oct 2024 17:32:20 +0000 Subject: [PATCH 7/8] Remove finder tracking from specialist-finder index The new index should only track specialist documents, as the
finder itself is not searchable within the finder and is only needed on govuk wide search --- .../presenters/common_fields_presenter.rb | 1 + .../publishing_event_processor.rb | 2 +- lib/tasks/message_queue.rake | 7 +- .../govuk_index/specialist_formats_spec.rb | 101 ++++++++++++++++++ ...s_spec.rb => specialist_documents_spec.rb} | 32 +++--- .../specialist_formats_spec.rb | 0 6 files changed, 123 insertions(+), 20 deletions(-) create mode 100644 spec/integration/govuk_index/specialist_formats_spec.rb rename spec/integration/specialist_finder_index/{specialist_formats_spec.rb => specialist_documents_spec.rb} (78%) rename spec/unit/specialist_finder_index/{ => presenter}/specialist_formats_spec.rb (100%) diff --git a/lib/govuk_index/presenters/common_fields_presenter.rb b/lib/govuk_index/presenters/common_fields_presenter.rb index ede465646..f532ab6e0 100644 --- a/lib/govuk_index/presenters/common_fields_presenter.rb +++ b/lib/govuk_index/presenters/common_fields_presenter.rb @@ -1,6 +1,7 @@ module GovukIndex class CommonFieldsPresenter CUSTOM_FORMAT_MAP = { + "esi_fund" => "european_structural_investment_fund", "external_content" => "recommended-link", "service_manual_homepage" => "service_manual_guide", "service_manual_service_standard" => "service_manual_guide", diff --git a/lib/specialist_finder_index/publishing_event_processor.rb b/lib/specialist_finder_index/publishing_event_processor.rb index 6e21c26d7..158200431 100644 --- a/lib/specialist_finder_index/publishing_event_processor.rb +++ b/lib/specialist_finder_index/publishing_event_processor.rb @@ -4,7 +4,7 @@ def process(messages) messages = Array(messages) # treat a single message as an array with one value Services.statsd_client.increment("specialist_finder_index.rabbit-mq-consumed") - PublishingEventJob.perform_async(messages.map { |msg| [msg.delivery_info[:routing_key], msg.payload] }) + SpecialistFinderIndex::PublishingEventJob.perform_async(messages.map { |msg| [msg.delivery_info[:routing_key], 
msg.payload] }) messages.each(&:ack) end end diff --git a/lib/tasks/message_queue.rake b/lib/tasks/message_queue.rake index 0743438c0..06a935b4d 100644 --- a/lib/tasks/message_queue.rake +++ b/lib/tasks/message_queue.rake @@ -9,9 +9,8 @@ namespace :message_queue do exch = Bunny::Exchange.new(channel, :topic, "published_documents") channel.queue("search_api_to_be_indexed").bind(exch, routing_key: "*.links") channel.queue("search_api_bulk_reindex").bind(exch, routing_key: "*.bulk.reindex") - channel.queue("search_api_govuk_index").bind(exch, routing_key: "*.*") channel.queue("search_api_specialist_finder_index_documents").bind(exch, routing_key: "specialist_document.*") - channel.queue("search_api_specialist_finder_index_finders").bind(exch, routing_key: "finder.*") + channel.queue("search_api_govuk_index").bind(exch, routing_key: "*.*") end desc "Index documents that are published to the publishing-api" @@ -36,10 +35,6 @@ namespace :message_queue do queue_name: "search_api_specialist_finder_index_documents", processor: SpecialistFinderIndex::PublishingEventProcessor.new, ).run - GovukMessageQueueConsumer::Consumer.new( - queue_name: "search_api_specialist_finder_index_finders", - processor: SpecialistFinderIndex::PublishingEventProcessor.new, - ).run end desc "Gets data from RabbitMQ and insert into govuk index (bulk reindex queue)" diff --git a/spec/integration/govuk_index/specialist_formats_spec.rb b/spec/integration/govuk_index/specialist_formats_spec.rb new file mode 100644 index 000000000..d648a1f1a --- /dev/null +++ b/spec/integration/govuk_index/specialist_formats_spec.rb @@ -0,0 +1,101 @@ +require "spec_helper" +RSpec.describe "SpecialistFormatsTest" do + before do + bunny_mock = BunnyMock.new + @channel = bunny_mock.start.channel + + consumer = GovukMessageQueueConsumer::Consumer.new( + queue_name: "bigwig.test", + processor: GovukIndex::PublishingEventProcessor.new, + rabbitmq_connection: bunny_mock, + ) + + @queue = @channel.queue("bigwig.test") + 
consumer.run + end + + it "specialist publisher finders are correctly indexed" do + random_example = generate_random_example( + schema: "finder", + payload: { document_type: "finder" }, + ) + + allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return("finder" => :all) + + @queue.publish(random_example.to_json, content_type: "application/json") + + expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "govuk_test", type: "edition") + end + + it "specialist documents are correctly indexed" do + document_types = %w[ + aaib_report + asylum_support_decision + business_finance_support_scheme + cma_case + countryside_stewardship_grant + drug_safety_update + employment_appeal_tribunal_decision + employment_tribunal_decision + flood_and_coastal_erosion_risk_management_research_report + international_development_fund + licence_transaction + maib_report + medical_safety_alert + protected_food_drink_name + raib_report + research_for_development_output + residential_property_tribunal_decision + service_standard_report + statutory_instrument + tax_tribunal_decision + utaac_decision + ] + + # ideally we would run a test for all document types, but this takes 3 seconds so I have limited + # it to a random subset + document_types.sample(3).each do |specialist_document_type| + random_example = generate_random_example( + schema: "specialist_document", + payload: { document_type: specialist_document_type }, + ) + allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return(specialist_document_type => :all) + + @queue.publish(random_example.to_json, content_type: "application/json") + + expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "govuk_test", type: specialist_document_type) + end + end + + it "esi documents are correctly indexed" do + publisher_document_type = "esi_fund" + search_document_type = "european_structural_investment_fund" + + random_example = generate_random_example( + 
schema: "specialist_document", + payload: { document_type: publisher_document_type }, + ) + allow(GovukIndex::MigratedFormats).to receive(:indexable_formats).and_return(search_document_type => :all) + + @queue.publish(random_example.to_json, content_type: "application/json") + + expect_document_is_in_rummager( + { "link" => random_example["base_path"], "format" => search_document_type }, + index: "govuk_test", + type: search_document_type, + ) + end + + it "finders email signup are never indexed" do + random_example = generate_random_example( + schema: "finder_email_signup", + payload: { document_type: "finder_email_signup" }, + ) + + @queue.publish(random_example.to_json, content_type: "application/json") + + expect { + fetch_document_from_rummager(id: random_example["base_path"], index: "govuk_test") + }.to raise_error(Elasticsearch::Transport::Transport::Errors::NotFound) + end +end diff --git a/spec/integration/specialist_finder_index/specialist_formats_spec.rb b/spec/integration/specialist_finder_index/specialist_documents_spec.rb similarity index 78% rename from spec/integration/specialist_finder_index/specialist_formats_spec.rb rename to spec/integration/specialist_finder_index/specialist_documents_spec.rb index 21939c780..7ec78419c 100644 --- a/spec/integration/specialist_finder_index/specialist_formats_spec.rb +++ b/spec/integration/specialist_finder_index/specialist_documents_spec.rb @@ -1,6 +1,6 @@ require "spec_helper" -RSpec.describe "SpecialistFormatTest" do +RSpec.describe "SpecialistDocumentsTest" do before do bunny_mock = BunnyMock.new @channel = bunny_mock.start.channel @@ -15,17 +15,6 @@ consumer.run end - it "specialist publisher finders are correctly indexed" do - random_example = generate_random_example( - schema: "finder", - payload: { document_type: "finder" }, - ) - - @queue.publish(random_example.to_json, content_type: "application/json") - - expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: 
"specialist-finder_test", type: "finder") - end - it "specialist documents are correctly indexed" do document_types = %w[ aaib_report @@ -49,7 +38,6 @@ statutory_instrument tax_tribunal_decision utaac_decision - esi_fund ] # ideally we would run a test for all document types, but this takes 3 seconds so I have limited @@ -65,4 +53,22 @@ expect_document_is_in_rummager({ "link" => random_example["base_path"] }, index: "specialist-finder_test", type: specialist_document_type) end end + + it "esi documents are correctly indexed" do + publisher_document_type = "esi_fund" + search_document_type = "european_structural_investment_fund" + + random_example = generate_random_example( + schema: "specialist_document", + payload: { document_type: publisher_document_type }, + ) + + @queue.publish(random_example.to_json, content_type: "application/json") + + expect_document_is_in_rummager( + { "link" => random_example["base_path"], "format" => search_document_type }, + index: "specialist-finder_test", + type: publisher_document_type, + ) + end end diff --git a/spec/unit/specialist_finder_index/specialist_formats_spec.rb b/spec/unit/specialist_finder_index/presenter/specialist_formats_spec.rb similarity index 100% rename from spec/unit/specialist_finder_index/specialist_formats_spec.rb rename to spec/unit/specialist_finder_index/presenter/specialist_formats_spec.rb From 88d7b2003b58095ea91726acc154922d9d5ebfd1 Mon Sep 17 00:00:00 2001 From: Minno Dang Date: Fri, 1 Nov 2024 10:48:33 +0000 Subject: [PATCH 8/8] Add new endpoint to query specialist-finder index New parallel stack to the existing search endpoint so we don't interfere with existing search functionality. When doing the testing, we've noticed that certain things are configured only for government_test index but not for the govuk (and hence also our new index) - such as spelling typo suggestions, and organisation expansion. Should the tests be more explicit about calling this out? 
--- lib/rummager/app.rb | 21 + lib/search/query_builder.rb | 12 + lib/search_config.rb | 22 + lib/search_server.rb | 3 +- .../search_specialist_documents_spec.rb | 545 ++++++++++++++++++ .../support/search_integration_spec_helper.rb | 40 +- 6 files changed, 622 insertions(+), 21 deletions(-) create mode 100644 spec/integration/search/search_specialist_documents_spec.rb diff --git a/lib/rummager/app.rb b/lib/rummager/app.rb index 1b04dfb01..9c7a193b8 100644 --- a/lib/rummager/app.rb +++ b/lib/rummager/app.rb @@ -143,6 +143,27 @@ def json_only halt(500, env["sinatra.error"].message) end + # Return results for the Specialist Finder searches + # + # For details, see docs/search-api.md + ["/specialist-documents-search.?:request_format?", "/api/specialist-documents-search.?:request_format?"].each do |path| + get path do + json_only + + query_params = parse_query_string(request.query_string) + + begin + results = SearchConfig.run_specialist_document_search(query_params) + rescue BaseParameterParser::ParseError => e + status 422 + return { error: e.error }.to_json + end + + headers["Access-Control-Allow-Origin"] = "*" + results.to_json + end + end + # Return results for the GOV.UK site search # # For details, see docs/search-api.md diff --git a/lib/search/query_builder.rb b/lib/search/query_builder.rb index e2bfb75a2..1e945498d 100644 --- a/lib/search/query_builder.rb +++ b/lib/search/query_builder.rb @@ -64,12 +64,24 @@ def query end def filter + return specialist_documents_post_filter if content_index_names.include?(SearchConfig.specialist_finder_index_name) + Search::FormatMigrator.new( search_params.search_config, base_query: QueryComponents::Filter.new(search_params).payload, ).call end + def specialist_documents_post_filter + { bool: + { + minimum_should_match: 1, + should: [{ + bool: { must: QueryComponents::Filter.new(search_params).payload }, + }], + } } + end + private attr_reader :content_index_names, :metasearch_index diff --git a/lib/search_config.rb 
b/lib/search_config.rb index 6ef9c1414..b9000f426 100644 --- a/lib/search_config.rb +++ b/lib/search_config.rb @@ -52,6 +52,11 @@ def run_search(raw_parameters) search_params.search_config.run_search_with_params(search_params) end + def run_specialist_document_search(raw_parameters) + search_params = parse_parameters(raw_parameters) + search_params.search_config.run_specialist_document_search_with_params(search_params) + end + def run_batch_search(searches) search_params = [] searches.each do |search| @@ -121,6 +126,10 @@ def run_search_with_params(search_params) searcher.run(search_params) end + def run_specialist_document_search_with_params(search_params) + specialist_document_searcher.run(search_params) + end + def run_batch_search_with_params(search_params) batch_searcher.run(search_params) end @@ -149,6 +158,10 @@ def new_content_index @new_content_index ||= search_server.index_for_search([SearchConfig.govuk_index_name]) end + def specialist_documents_content_index + @specialist_documents_content_index ||= search_server.index_for_search(SearchConfig.content_index_names + [SearchConfig.specialist_finder_index_name]) + end + def base_uri cluster.uri end @@ -177,6 +190,15 @@ def searcher ) end + def specialist_document_searcher + @specialist_document_searcher ||= Search::Query.new( + content_index: specialist_documents_content_index, + registries:, + metasearch_index:, + spelling_index:, + ) + end + def batch_searcher @batch_searcher ||= Search::BatchQuery.new( content_index:, diff --git a/lib/search_server.rb b/lib/search_server.rb index 132fe9cf0..839b9801b 100644 --- a/lib/search_server.rb +++ b/lib/search_server.rb @@ -12,6 +12,7 @@ def initialize(base_uri, schema, index_names, govuk_index_name, content_index_na @govuk_index_name = govuk_index_name @content_index_names = content_index_names @search_config = search_config + @specialist_finder_index_name = SearchConfig.specialist_finder_index_name end def index_group(prefix) @@ -52,7 +53,7 @@ def 
validate_index_name!(index_name) def index_name_valid?(index_name) index_name.split(",").all? do |name| - @index_names.include?(name) || @govuk_index_name == name + @index_names.include?(name) || @govuk_index_name == name || @specialist_finder_index_name == name end end end diff --git a/spec/integration/search/search_specialist_documents_spec.rb b/spec/integration/search/search_specialist_documents_spec.rb new file mode 100644 index 000000000..9701e032e --- /dev/null +++ b/spec/integration/search/search_specialist_documents_spec.rb @@ -0,0 +1,545 @@ +require "spec_helper" +require_relative "../../support/search_integration_spec_helper" + +RSpec.configure do |c| + c.include SearchIntegrationSpecHelper +end + +RSpec.describe "SearchSpecialistDocumentsTest" do + let(:index) { "specialist-finder_test" } + + it "returns success" do + get "/specialist-documents-search?q=important" + + expect(last_response).to be_ok + end + + it "spell checking with typo" do + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + } + commit_document("government_test", document_params) + + get "/specialist-documents-search?q=ministry of magick&suggest=spelling" + + expect(parsed_response["suggested_queries"]).to eq(["ministry of magic"]) + end + + it "highlights spelling suggestions" do + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + } + commit_document("government_test", document_params) + + get "/specialist-documents-search?q=ministry of magick&suggest=spelling_with_highlighting" + + expect(parsed_response["suggested_queries"]).to eq([{ + "text" => "ministry of magic", + "highlighted" => "ministry of magic", + }]) + end + + it "spell checking with blocklisted typo" do + commit_document( + index, + { + "title" => "Brexitt", + "description" => "Brexitt", + "link" => "/brexitt", + }, + ) + + get 
"/specialist-documents-search?q=brexit&suggest=spelling" + + expect(parsed_response["suggested_queries"]).to eq([]) + end + + it "spell checking without typo" do + add_sample_documents(index, 1) + + get "/specialist-documents-search?q=milliband" + + expect(parsed_response["suggested_queries"]).to eq([]) + end + + it "sort by date ascending" do + add_sample_documents(index, 2) + + get "/specialist-documents-search?q=important&order=public_timestamp" + + expect(result_links.take(2)).to eq(["/specialist-finder-1", "/specialist-finder-2"]) + end + + it "sort by date descending" do + add_sample_documents(index, 2) + + get "/specialist-documents-search?q=important&order=-public_timestamp" + + # The government links have dates, so appear before all the other links. + # The other documents have no dates, so appear in an undefined order + expect(result_links.take(2)).to eq(["/specialist-finder-1", "/specialist-finder-2"]) + end + + it "sort by title ascending" do + add_sample_documents(index, 1) + + get "/specialist-documents-search?order=title" + lowercase_titles = result_titles.map(&:downcase) + + expect(lowercase_titles).to eq(["sample specialist-finder document 1"]) + end + + it "filter by field" do + add_sample_documents(index, 1) + + get "/specialist-documents-search?filter_mainstream_browse_pages=browse/page/1" + + expect(result_links.sort).to eq(["/specialist-finder-1"]) + end + + it "reject by field" do + add_sample_documents(index, 2) + + get "/specialist-documents-search?reject_mainstream_browse_pages=browse/page/1" + + expect(result_links.sort).to eq(["/specialist-finder-2"]) + end + + it "can filter for missing field" do + add_sample_documents(index, 1) + + get "/specialist-documents-search?filter_manual=_MISSING" + + expect(result_links.sort).to eq(["/specialist-finder-1"]) + end + + it "can filter for missing or specific value in field" do + add_sample_documents(index, 1) + + get 
"/specialist-documents-search?filter_document_type[]=_MISSING&filter_document_type[]=edition" + + expect(result_links.sort).to eq(["/specialist-finder-1"]) + end + + it "can filter and reject" do + add_sample_documents(index, 2) + + get "/specialist-documents-search?reject_mainstream_browse_pages=1&filter_document_type[]=edition" + + expect(result_links.sort).to eq(["/specialist-finder-1", "/specialist-finder-2"]) + end + + describe "filter/reject when an attribute has multiple values" do + before do + commit_document( + index, + { + "link" => "/one", + "part_of_taxonomy_tree" => %w[a b c], + }, + ) + commit_document( + index, + { + "link" => "/two", + "part_of_taxonomy_tree" => %w[d e f], + }, + ) + commit_document( + index, + { + "link" => "/three", + "part_of_taxonomy_tree" => %w[b e], + }, + ) + end + + describe "filter_all" do + it "filters all documents containing taxon b and e" do + get "/specialist-documents-search?filter_all_part_of_taxonomy_tree=b&filter_all_part_of_taxonomy_tree=e" + expect(result_links.sort).to eq([ + "/three", + ]) + end + end + + describe "filter_any" do + it "filters any document containing taxon c or f" do + get "/specialist-documents-search?filter_any_part_of_taxonomy_tree=c&filter_any_part_of_taxonomy_tree=f" + expect(result_links.sort).to match_array([ + "/one", "/two" + ]) + end + end + + describe "reject_all" do + it "rejects all documents containing taxon b and e" do + get "/specialist-documents-search?reject_all_part_of_taxonomy_tree=b&reject_all_part_of_taxonomy_tree=e" + expect(result_links.sort).to match_array([ + "/one", "/two" + ]) + end + end + + describe "reject_any" do + it "rejects any documents containing taxon c or f" do + get "/specialist-documents-search?reject_any_part_of_taxonomy_tree=c&reject_any_part_of_taxonomy_tree=f" + expect(result_links.sort).to match_array([ + "/three", + ]) + end + end + end + + describe "boolean filtering" do + context "when boolean filters are not true or false" do + it "returns an 
error" do + get "/specialist-documents-search?filter_is_withdrawn=blah" + + expect(last_response.status).to eq(422) + expect(parsed_response).to eq({ "error" => "is_withdrawn requires a boolean (true or false)" }) + end + end + + context "when an invalid filter is used" do + it "returns an error" do + get "/specialist-documents-search?filter_has_some_very_incorrect_filter=false" + + expect(last_response.status).to eq(422) + expect(parsed_response).to eq({ "error" => "\"has_some_very_incorrect_filter\" is not a valid filter field" }) + end + end + + context "when a valid filter is used" do + before do + add_sample_documents(index, 2) + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + "has_official_document" => true, + } + commit_document("government_test", document_params) + + document_params = { + "title" => "Advice on Treatment of Dragons", + "link" => "/dragon-guide", + "has_official_document" => false, + } + commit_document("government_test", document_params) + end + + it "can filter on boolean fields = true" do + get "/specialist-documents-search?filter_has_official_document=true" + expect(result_links.sort).to eq(%w[/ministry-of-magic-site]) + end + + it "can filter on boolean fields = false" do + get "/specialist-documents-search?filter_has_official_document=false" + + expect(result_links.sort).to eq(%w[/dragon-guide]) + end + end + end + + it "only contains fields which are present" do + add_sample_documents(index, 2) + + get "/specialist-documents-search?q=important&order=public_timestamp" + + results = parsed_response["results"] + expect(results[1]["title"]).to eq("Sample specialist-finder document 2") + end + + it "validates integer params" do + get "/specialist-documents-search?start=a" + + expect(last_response.status).to eq(422) + expect(parsed_response).to eq({ "error" => "Invalid value \"a\" for parameter \"start\" (expected positive integer)" }) + end + + it "allows 
integer params leading zeros" do + get "/specialist-documents-search?start=09" + + expect(last_response).to be_ok + end + + it "validates unknown params" do + get "/specialist-documents-search?foo&bar=1" + + expect(last_response.status).to eq(422) + expect(parsed_response).to eq("error" => "Unexpected parameters: foo, bar") + end + + it "debug explain returns explanations" do + add_sample_documents(index, 1) + + get "/specialist-documents-search?debug=explain" + + first_hit_explain = parsed_response["results"].first["_explanation"] + expect(first_hit_explain).not_to be_nil + expect(first_hit_explain.keys).to include("value") + expect(first_hit_explain.keys).to include("description") + expect(first_hit_explain.keys).to include("details") + end + + it "can scope by elasticsearch type" do + commit_document(index, cma_case_attributes, type: "cma_case") + + get "/specialist-documents-search?filter_document_type=cma_case" + + expect(last_response).to be_ok + expect(parsed_response.fetch("total")).to eq(1) + expect(parsed_response.fetch("results").fetch(0)).to match( + hash_including( + "document_type" => "cma_case", + "title" => cma_case_attributes.fetch("title"), + "link" => cma_case_attributes.fetch("link"), + ), + ) + end + + it "can filter between dates" do + commit_document(index, cma_case_attributes, type: "cma_case") + + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=from:2014-03-31,to:2014-04-02" + + expect(last_response).to be_ok + expect(parsed_response.fetch("total")).to eq(1) + expect(parsed_response.fetch("results").fetch(0)).to match( + hash_including( + "title" => cma_case_attributes.fetch("title"), + "link" => cma_case_attributes.fetch("link"), + ), + ) + end + + it "can filter between dates with reversed parameter order" do + commit_document(index, cma_case_attributes, type: "cma_case") + + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=to:2014-04-02,from:2014-03-31" + + 
expect(last_response).to be_ok + expect(parsed_response.fetch("total")).to eq(1) + expect(parsed_response.fetch("results").fetch(0)).to match( + hash_including( + "title" => cma_case_attributes.fetch("title"), + "link" => cma_case_attributes.fetch("link"), + ), + ) + end + + it "can filter from date" do + commit_filter_from_date_documents(index) + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=from:2014-03-31" + + expect(last_response).to be_ok + expect_response_includes_matching_date_and_datetime_results(parsed_response.fetch("results")) + end + + it "can filter from time" do + commit_filter_from_time_documents(index) + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=from:2014-03-31 14:00:00" + + expect(last_response).to be_ok + expect_response_includes_matching_date_and_datetime_results(parsed_response.fetch("results")) + end + + it "can filter to date" do + commit_filter_to_date_documents(index) + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=to:2014-04-02" + + expect(last_response).to be_ok + expect_response_includes_matching_date_and_datetime_results(parsed_response.fetch("results")) + end + + it "can filter to time" do + commit_filter_to_time_documents(index) + + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=to:2014-04-02 11:00:00" + + expect(last_response).to be_ok + expect_response_includes_matching_date_and_datetime_results(parsed_response.fetch("results")) + end + + it "can filter times in different time zones" do + commit_document( + index, + cma_case_attributes("opened_date" => "2017-07-01T11:20:00.000-03:00", "link" => "/cma-1"), + type: "cma_case", + ) + commit_document( + index, + cma_case_attributes("opened_date" => "2017-07-02T00:15:00.000+01:00", "link" => "/cma-2"), + type: "cma_case", + ) + + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=from:2017-07-01 
12:00,to:2017-07-01 23:30:00" + + expect(last_response).to be_ok + expect(parsed_response.fetch("results")).to contain_exactly( + hash_including("link" => "/cma-1"), + hash_including("link" => "/cma-2"), + ) + end + + it "cannot provide date filter key multiple times" do + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date[]=from:2014-03-31&filter_opened_date[]=to:2014-04-02" + + expect(last_response.status).to eq(422) + expect( + parsed_response, + ).to eq( + { "error" => %{Too many values (2) for parameter "opened_date" (must occur at most once)} }, + ) + end + + it "cannot provide invalid dates for date filter" do + get "/specialist-documents-search?filter_document_type=cma_case&filter_opened_date=from:not-a-date" + + expect(last_response.status).to eq(422) + expect( + parsed_response, + ).to eq( + { "error" => %{Invalid "from" value "not-a-date" for parameter "opened_date" (expected ISO8601 date)} }, + ) + end + + it "expands organisations" do + document_params = { + "title" => "Advice on Treatment of Dragons", + "link" => "/dragon-guide", + "organisations" => ["/ministry-of-magic"], + } + commit_document("government_test", document_params) + + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + "format" => "organisation", + } + commit_document("government_test", document_params) + + get "/specialist-documents-search.json?q=dragons" + + expect(first_result["organisations"]).to eq( + [{ "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic" }], + ) + end + + it "also works with the /api prefix" do + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + "format" => "organisation", + } + commit_document("government_test", document_params) + + document_params = { + "title" => "Advice on Treatment of Dragons", + "link" => 
"/dragon-guide", + "organisations" => ["/ministry-of-magic"], + } + commit_document("government_test", document_params) + + get "/api/specialist-documents-search.json?q=dragons" + + expect(first_result["organisations"]).to eq( + [{ "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic" }], + ) + end + + it "expands organisations via content_id" do + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + "content_id" => "organisation-content-id", + "format" => "organisation", + } + commit_document("government_test", document_params) + + document_params = { + "title" => "Advice on Treatment of Dragons", + "link" => "/dragon-guide", + "organisation_content_ids" => %w[organisation-content-id], + } + commit_document("government_test", document_params) + + get "/specialist-documents-search.json?q=dragons" + + # Adds a new key with the expanded organisations + expect_result_includes_ministry_of_magic_for_key(first_result, "expanded_organisations", "content_id" => "organisation-content-id") + + # Keeps the organisation content ids + expect( + first_result["organisation_content_ids"], + ).to eq( + %w[organisation-content-id], + ) + end + + it "will show the query" do + get "/specialist-documents-search?q=test&debug=show_query" + + expect(parsed_response.fetch("elasticsearch_query")).to be_truthy + end + + it "will show the cluster" do + get "/specialist-documents-search?q=test" + expect(parsed_response.fetch("es_cluster")).to eq(Clusters.default_cluster.key) + + Clusters.active.each do |cluster| + get "/specialist-documents-search?q=test&ab_tests=search_cluster_query:#{cluster.key}" + expect(parsed_response.fetch("es_cluster")).to eq(cluster.key) + end + end + + it "can return the taxonomy" do + document_params = { + "slug" => "/ministry-of-magic", + "link" => "/ministry-of-magic-site", + "title" => "Ministry of Magic", + "taxons" => 
%w[eb2093ef-778c-4105-9f33-9aa03d14bc5c], + } + commit_document(index, document_params) + + get "/specialist-documents-search?q=Ministry of Magict&fields[]=taxons" + expect(parsed_response.fetch("total")).to eq(1) + + taxons = parsed_response.dig("results", 0, "taxons") + expect(taxons).to eq(%w[eb2093ef-778c-4105-9f33-9aa03d14bc5c]) + end + +private + + def first_result + @first_result ||= parsed_response["results"].first + end + + def result_links + @result_links ||= parsed_response["results"].map do |result| + result["link"] + end + end + + def result_titles + @result_titles ||= parsed_response["results"].map do |result| + result["title"] + end + end + + def es_score_by_link(link) + parsed_response["results"].find { |result| result["link"] == link }["es_score"] + end +end diff --git a/spec/support/search_integration_spec_helper.rb b/spec/support/search_integration_spec_helper.rb index cd12f0c44..596ce53e2 100644 --- a/spec/support/search_integration_spec_helper.rb +++ b/spec/support/search_integration_spec_helper.rb @@ -31,93 +31,93 @@ def cma_case_attributes(attributes = {}) }.merge(attributes) end - def commit_filter_from_date_documents + def commit_filter_from_date_documents(index = "govuk_test") commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-30", "link" => "/old-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-30T23:00:00.000+00:00", "link" => "/old-cma-with-datetime"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-31", "link" => "/matching-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-31T00:00:00.000+00:00", "link" => "/matching-cma-with-datetime"), type: "cma_case", ) end - def commit_filter_from_time_documents + def commit_filter_from_time_documents(index = "govuk_test") commit_document( - 
"govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-31", "link" => "/old-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-31T13:59:59.000+00:00", "link" => "/old-cma-with-datetime"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-01", "link" => "/matching-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-03-31T14:00:00.000+00:00", "link" => "/matching-cma-with-datetime"), type: "cma_case", ) end - def commit_filter_to_date_documents + def commit_filter_to_date_documents(index = "govuk_test") commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-02", "link" => "/matching-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-02T05:00:00.000+00:00", "link" => "/matching-cma-with-datetime"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-03", "link" => "/future-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-03T00:00:00.000+00:00", "link" => "/future-cma-with-datetime"), type: "cma_case", ) end - def commit_filter_to_time_documents + def commit_filter_to_time_documents(index = "govuk_test") commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-02", "link" => "/matching-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-02T11:00:00.000+00:00", "link" => "/matching-cma-with-datetime"), type: "cma_case", ) commit_document( - "govuk_test", + index, cma_case_attributes("opened_date" => "2014-04-03", "link" => "/future-cma-with-date"), type: "cma_case", ) commit_document( - "govuk_test", + index, 
cma_case_attributes("opened_date" => "2014-04-02T11:00:01.000+00:00", "link" => "/future-cma-with-datetime"), type: "cma_case", )