diff --git a/.gitignore b/.gitignore index 6031947e..e0fa8aa3 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,8 @@ yarn-debug.log* /config/puma.rb # Ignore public uploaded files -public/uploads +public/uploads/ +public/sitemaps/ # Ignore public/assets as assets are generated in each server when deploying public/assets @@ -41,9 +42,10 @@ public/sw.js spec/decidim_dummy_app/ .rspec-failures -# Ignore idea config files +# Ignore IDE config files .idea *.iml +.vscode/ coverage/ storage/ diff --git a/Gemfile b/Gemfile index 5d146952..79e6ed57 100644 --- a/Gemfile +++ b/Gemfile @@ -36,6 +36,7 @@ gem "daemons" gem "deface" gem "delayed_job_active_record" +gem "sitemap_generator", "~> 7.0" gem "whenever", require: false gem "recaptcha" diff --git a/Gemfile.lock b/Gemfile.lock index ca03c27e..1ce70633 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -860,6 +860,8 @@ GEM simplecov (~> 0.19) simplecov-html (0.13.2) simplecov_json_formatter (0.1.4) + sitemap_generator (7.0.1) + builder (~> 3.0) smart_properties (1.17.0) snaky_hash (2.0.3) hashie (>= 0.1.0, < 6) @@ -974,6 +976,7 @@ DEPENDENCIES rubocop-faker rubocop-rspec rubocop-rspec_rails + sitemap_generator (~> 7.0) soda-ruby stringio (~> 3.1.7) web-console diff --git a/config/schedule.rb b/config/schedule.rb index 1041b3e9..81ae3317 100644 --- a/config/schedule.rb +++ b/config/schedule.rb @@ -22,6 +22,10 @@ rake "tmp:clear" end +every 1.day, at: "3:15 am" do + rake "sitemap:generate" +end + every 5.minutes do rake "participatory_processes_phases:enqueue_change_active_step" end diff --git a/config/sitemap.rb b/config/sitemap.rb new file mode 100644 index 00000000..5b30e9c0 --- /dev/null +++ b/config/sitemap.rb @@ -0,0 +1,241 @@ +# frozen_string_literal: true + +# --------------------------------------------------------------------------- +# Sitemap configuration for participa.gencat.cat +# +# Groups and their content: +# processes – participatory process landing pages + component pages +# regulations – /regulations 
index + regulation process + component pages
+#   assemblies     – assembly landing pages + component pages
+#   meetings       – individual meeting show pages (processes and assemblies)
+#   proposals      – individual proposal show pages (processes and assemblies)
+#   pages          – homepage, listing pages, static CMS pages
+#   blogs          – individual blog post show pages (processes and assemblies)
+#   debates        – individual debate show pages (processes and assemblies)
+#   budgets        – individual budget project show pages (processes and assemblies)
+#   accountability – individual accountability result show pages (processes and assemblies)
+#   attachments    – PDF and image file blob URLs
+#
+# Generated files land in public/sitemaps/.
+# The master index is public/sitemaps/sitemap.xml.gz.
+# ---------------------------------------------------------------------------
+
+# Make Rails route helpers (incl. rails_blob_path) available inside the create
+# block and all group blocks. Module#include is public, so no send is needed.
+SitemapGenerator::Interpreter.include(Rails.application.routes.url_helpers)
+
+# Helper: returns the URL prefix for a participatory space, or nil for
+# spaces that are neither a participatory process nor an assembly.
+SitemapGenerator::Interpreter.class_eval do
+  def space_url_prefix(space)
+    return unless space.respond_to?(:published?) && space.published? # unpublished (or missing) space: not public
+
+    case space
+    when Decidim::ParticipatoryProcess then "/processes"
+    when Decidim::Assembly then "/assemblies"
+    end
+  end
+end
+
+organization = Decidim::Organization.first! # raises RecordNotFound instead of failing later on nil
+regulation_group_id = Rails.application.config.regulation
+
+SitemapGenerator::Sitemap.default_host = "https://#{organization.host}"
+SitemapGenerator::Sitemap.sitemaps_path = "sitemaps/"
+SitemapGenerator::Sitemap.create_index = true
+SitemapGenerator::Sitemap.compress = true
+SitemapGenerator::Sitemap.include_root = false
+SitemapGenerator::Sitemap.include_index = false
+
+SitemapGenerator::Sitemap.create do
+  # ── Participatory Processes (excluding regulations) ────────────────────────
+  group(filename: :processes) do
+    Decidim::ParticipatoryProcess
+      .where(organization:)
+      .published
+      .where("decidim_participatory_process_group_id IS NULL OR decidim_participatory_process_group_id != ?", regulation_group_id)
+      .preload(:components)
+      .find_each do |process|
+        next if process.slug.blank?
+
+        add "/processes/#{process.slug}",
+            lastmod: process.updated_at, changefreq: "weekly", priority: 0.8
+
+        process.components.select { |c| c.published_at.present? }.each do |component|
+          add "/processes/#{process.slug}/f/#{component.id}",
+              lastmod: component.updated_at, changefreq: "weekly", priority: 0.6
+        end
+      end
+  end
+
+  # ── Regulations ─────────────────────────────────────────────────────────────
+  group(filename: :regulations) do
+    add "/regulations", changefreq: "weekly", priority: 0.7
+
+    Decidim::ParticipatoryProcess
+      .where(organization:, decidim_participatory_process_group_id: regulation_group_id)
+      .published
+      .preload(:components)
+      .find_each do |process|
+        next if process.slug.blank?
+
+        add "/processes/#{process.slug}",
+            lastmod: process.updated_at, changefreq: "monthly", priority: 0.7
+
+        process.components.select { |c| c.published_at.present?
}.each do |component| + add "/processes/#{process.slug}/f/#{component.id}", + lastmod: component.updated_at, changefreq: "monthly", priority: 0.5 + end + end + end + + # ── Assemblies ────────────────────────────────────────────────────────────── + group(filename: :assemblies) do + Decidim::Assembly + .where(organization:) + .published + .preload(:components) + .find_each do |assembly| + next if assembly.slug.blank? + + add "/assemblies/#{assembly.slug}", + lastmod: assembly.updated_at, changefreq: "weekly", priority: 0.8 + + assembly.components.select { |c| c.published_at.present? }.each do |component| + add "/assemblies/#{assembly.slug}/f/#{component.id}", + lastmod: component.updated_at, changefreq: "weekly", priority: 0.6 + end + end + end + + # ── Meetings ──────────────────────────────────────────────────────────────── + group(filename: :meetings) do + Decidim::Meetings::Meeting + .joins(:component) + .where.not(decidim_components: { published_at: nil }) + .where.not(published_at: nil) + .preload(component: :participatory_space) + .find_each do |meeting| + space = meeting.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? + + add "#{prefix}/#{space.slug}/f/#{meeting.decidim_component_id}/meetings/#{meeting.id}", + lastmod: meeting.updated_at, changefreq: "weekly", priority: 0.5 + end + end + + # ── Proposals ─────────────────────────────────────────────────────────────── + group(filename: :proposals) do + Decidim::Proposals::Proposal + .joins(:component) + .where.not(decidim_components: { published_at: nil }) + .where.not(published_at: nil) + .preload(component: :participatory_space) + .find_each do |proposal| + space = proposal.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? 
+ + add "#{prefix}/#{space.slug}/f/#{proposal.decidim_component_id}/proposals/#{proposal.id}", + lastmod: proposal.updated_at, changefreq: "monthly", priority: 0.5 + end + end + + # ── Static Pages ──────────────────────────────────────────────────────────── + group(filename: :pages) do + add "/", changefreq: "daily", priority: 1.0 + add "/processes", changefreq: "daily", priority: 0.7 + add "/assemblies", changefreq: "daily", priority: 0.7 + + Decidim::StaticPage + .where(organization:) + .find_each do |page| + add "/pages/#{page.slug}", + lastmod: page.updated_at, changefreq: "monthly", priority: 0.4 + end + end + + # ── Blog Posts ────────────────────────────────────────────────────────────── + group(filename: :blogs) do + Decidim::Blogs::Post + .joins(:component) + .where.not(decidim_components: { published_at: nil }) + .preload(component: :participatory_space) + .find_each do |post| + space = post.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? + + add "#{prefix}/#{space.slug}/f/#{post.decidim_component_id}/posts/#{post.id}", + lastmod: post.updated_at, changefreq: "monthly", priority: 0.5 + end + end + + # ── Debates ───────────────────────────────────────────────────────────────── + group(filename: :debates) do + Decidim::Debates::Debate + .joins(:component) + .where.not(decidim_components: { published_at: nil }) + .preload(component: :participatory_space) + .find_each do |debate| + space = debate.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? 
+ + add "#{prefix}/#{space.slug}/f/#{debate.decidim_component_id}/debates/#{debate.id}", + lastmod: debate.updated_at, changefreq: "weekly", priority: 0.5 + end + end + + # ── Budget Projects ───────────────────────────────────────────────────────── + group(filename: :budgets) do + Decidim::Budgets::Project + .joins(budget: :component) + .where.not(decidim_components: { published_at: nil }) + .preload(budget: { component: :participatory_space }) + .find_each do |project| + space = project.budget.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? + + add "#{prefix}/#{space.slug}/f/#{project.budget.decidim_component_id}" \ + "/budgets/#{project.decidim_budgets_budget_id}/projects/#{project.id}", + lastmod: project.updated_at, changefreq: "monthly", priority: 0.4 + end + end + + # ── Accountability Results ─────────────────────────────────────────────────── + group(filename: :accountability) do + Decidim::Accountability::Result + .joins(:component) + .where.not(decidim_components: { published_at: nil }) + .preload(component: :participatory_space) + .find_each do |result| + space = result.component.participatory_space + prefix = space_url_prefix(space) + next unless prefix && space&.slug.present? + + add "#{prefix}/#{space.slug}/f/#{result.decidim_component_id}/results/#{result.id}", + lastmod: result.updated_at, changefreq: "monthly", priority: 0.4 + end + end + + # ── Attachments (PDFs and images) ─────────────────────────────────────────── + # Adds blob redirect URLs so search engines can index uploaded documents and + # images. The ActiveStorage redirect path is stable (the signed_id does not + # expire). + group(filename: :attachments) do + Decidim::Attachment + .where("content_type LIKE 'image/%' OR content_type = 'application/pdf'") + .find_each do |attachment| + next unless attachment.file.attached? 
+
+        blob_path = rails_blob_path(attachment.file, only_path: true)
+
+        add blob_path,
+            changefreq: "yearly",
+            priority: attachment.content_type == "application/pdf" ? 0.4 : 0.3
+      end
+  end
+end
diff --git a/lib/tasks/sitemap.rake b/lib/tasks/sitemap.rake
new file mode 100644
index 00000000..17d7f98d
--- /dev/null
+++ b/lib/tasks/sitemap.rake
@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+
+namespace :sitemap do
+  desc "Generate sitemaps, purge stale files, and update robots.txt"
+  task generate: :environment do
+    unless Rails.env.production?
+      puts "Sitemap generation is intended for production environments only. Aborting."
+      exit(-1)
+    end
+
+    sitemaps_dir = Rails.public_path.join("sitemaps")
+
+    # ── Step 1: Generate ────────────────────────────────────────────────────
+    generation_started_at = Time.current
+
+    SitemapGenerator::Sitemap.verbose = true
+    load Rails.root.join("config", "sitemap.rb")
+
+    # ── Step 2: Purge stale files ────────────────────────────────────────────
+    # Any sitemap file whose mtime predates this run was produced by a previous
+    # generation and is no longer valid (e.g. a group that produced N numbered
+    # files before now only produces N-1).
+    #
+    # Compare whole seconds: some filesystems store mtimes with one-second
+    # granularity, so a file written in the same second the run started would
+    # otherwise look older than the sub-second start time and be deleted.
+    cutoff = generation_started_at.to_i
+    stale = Dir.glob(sitemaps_dir.join("*.xml.gz")).select do |file|
+      File.mtime(file).to_i < cutoff
+    end
+
+    stale.each do |file|
+      File.delete(file)
+      puts "Removed stale sitemap: #{File.basename(file)}"
+    end
+
+    # ── Step 3: Rebuild robots.txt ───────────────────────────────────────────
+    organization = Decidim::Organization.first
+    host = "https://#{organization.host}"
+
+    new_sitemap_files = Dir.glob(sitemaps_dir.join("*.xml.gz"))
+    sitemap_directives = new_sitemap_files.map do |file|
+      "Sitemap: #{host}/sitemaps/#{File.basename(file)}"
+    end
+
+    # A missing robots.txt is treated as empty instead of raising Errno::ENOENT.
+    robots_path = Rails.public_path.join("robots.txt")
+    current_content = robots_path.exist? ? File.read(robots_path) : ""
+
+    # Strip any existing Sitemap: lines (including the trailing newline of each).
+    # Matched case-insensitively so lines written as "sitemap:" are replaced too.
+    cleaned_content = current_content.gsub(/^Sitemap:.*\n?/i, "").rstrip
+
+    # Empty sections are skipped so the file never starts with blank lines.
+    sections = [cleaned_content, sitemap_directives.join("\n")].reject(&:empty?)
+    File.write(robots_path, "#{sections.join("\n\n")}\n")
+
+    puts "robots.txt updated with #{sitemap_directives.size} Sitemap directives."
+ end +end diff --git a/spec/factories.rb b/spec/factories.rb index fea89b12..ce0ba5a2 100644 --- a/spec/factories.rb +++ b/spec/factories.rb @@ -4,6 +4,11 @@ require "decidim/participatory_processes/test/factories" require "decidim/proposals/test/factories" require "decidim/meetings/test/factories" +require "decidim/assemblies/test/factories" +require "decidim/blogs/test/factories" +require "decidim/debates/test/factories" +require "decidim/budgets/test/factories" +require "decidim/accountability/test/factories" FactoryBot.define do factory :external_author, class: "Decidim::ExternalAuthor" do diff --git a/spec/lib/sitemap_spec.rb b/spec/lib/sitemap_spec.rb new file mode 100644 index 00000000..28a5a8c6 --- /dev/null +++ b/spec/lib/sitemap_spec.rb @@ -0,0 +1,368 @@ +# frozen_string_literal: true + +require "rails_helper" +require "zlib" +require "uri" + +describe "config/sitemap.rb" do + let!(:organization) { create(:organization) } + let(:tmpdir) { Pathname.new(Dir.mktmpdir) } + + # Save and restore SitemapGenerator state around each example so that the + # global singleton does not leak settings between tests. + around do |example| + orig_public_path = SitemapGenerator::Sitemap.public_path + orig_verbose = SitemapGenerator::Sitemap.verbose + + FileUtils.mkdir_p(tmpdir.join("sitemaps")) + SitemapGenerator::Sitemap.public_path = tmpdir + SitemapGenerator::Sitemap.verbose = false + + example.run + ensure + FileUtils.rm_rf(tmpdir) + SitemapGenerator::Sitemap.public_path = orig_public_path + SitemapGenerator::Sitemap.verbose = orig_verbose + end + + # Loads config/sitemap.rb, which sets options on SitemapGenerator::Sitemap + # and immediately runs SitemapGenerator::Sitemap.create, writing .xml.gz + # files to tmpdir/sitemaps/. + def generate_sitemap! + load Rails.root.join("config", "sitemap.rb") + end + + # Returns the path portion of every URL found across all non-index sitemap + # files produced by the last generate_sitemap! call. 
+ def sitemap_paths + Dir.glob(tmpdir.join("sitemaps", "*.xml.gz")) + .reject { |f| File.basename(f) == "sitemap.xml.gz" } + .flat_map do |path| + xml = Zlib::GzipReader.open(path, &:read) + Nokogiri::XML(xml) + .xpath("//sm:loc", "sm" => "http://www.sitemaps.org/schemas/sitemap/0.9") + .map { |node| URI.parse(node.text).path } + end + end + + # Returns the path portion of every URL found in a specific named sub-sitemap. + # filename should be a Symbol or String matching the group filename (e.g. :processes). + def sitemap_paths_for(filename) + path = tmpdir.join("sitemaps", "#{filename}.xml.gz") + return [] unless File.exist?(path) + + xml = Zlib::GzipReader.open(path, &:read) + Nokogiri::XML(xml) + .xpath("//sm:loc", "sm" => "http://www.sitemaps.org/schemas/sitemap/0.9") + .map { |node| URI.parse(node.text).path } + end + + # ── Participatory Processes ────────────────────────────────────────────────── + + describe "processes group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:published_component) { create(:component, :published, participatory_space: process) } + let!(:unpublished_component) { create(:component, :unpublished, participatory_space: process) } + let!(:unpublished_process) { create(:participatory_process, organization:, published_at: nil) } + let!(:regulation_process) do + create(:participatory_process, :published, :with_steps, organization:, + decidim_participatory_process_group_id: Rails.application.config.regulation) + end + + before { generate_sitemap! 
} + + it "includes the published process landing page" do + expect(sitemap_paths).to include("/processes/#{process.slug}") + end + + it "includes published component pages for the process" do + expect(sitemap_paths).to include("/processes/#{process.slug}/f/#{published_component.id}") + end + + it "excludes unpublished component pages" do + expect(sitemap_paths).not_to include("/processes/#{process.slug}/f/#{unpublished_component.id}") + end + + it "excludes unpublished processes" do + expect(sitemap_paths).not_to include("/processes/#{unpublished_process.slug}") + end + + it "excludes regulation processes from the processes group" do + expect(sitemap_paths_for(:processes)).not_to include("/processes/#{regulation_process.slug}") + end + end + + # ── Regulations ────────────────────────────────────────────────────────────── + + describe "regulations group" do + let!(:regulation_process) do + create(:participatory_process, :published, :with_steps, organization:, + decidim_participatory_process_group_id: Rails.application.config.regulation) + end + let!(:regulation_component) do + create(:component, :published, participatory_space: regulation_process) + end + + before { generate_sitemap! 
} + + it "includes the /regulations listing page" do + expect(sitemap_paths).to include("/regulations") + end + + it "includes the regulation process page" do + expect(sitemap_paths).to include("/processes/#{regulation_process.slug}") + end + + it "includes the regulation process component page" do + expect(sitemap_paths).to include("/processes/#{regulation_process.slug}/f/#{regulation_component.id}") + end + end + + # ── Assemblies ─────────────────────────────────────────────────────────────── + + describe "assemblies group" do + let!(:assembly) { create(:assembly, :published, organization:) } + let!(:published_component) { create(:component, :published, participatory_space: assembly) } + let!(:unpublished_assembly) { create(:assembly, :unpublished, organization:) } + + before { generate_sitemap! } + + it "includes the published assembly landing page" do + expect(sitemap_paths).to include("/assemblies/#{assembly.slug}") + end + + it "includes published component pages for the assembly" do + expect(sitemap_paths).to include("/assemblies/#{assembly.slug}/f/#{published_component.id}") + end + + it "excludes unpublished assemblies" do + expect(sitemap_paths).not_to include("/assemblies/#{unpublished_assembly.slug}") + end + end + + # ── Meetings ───────────────────────────────────────────────────────────────── + + describe "meetings group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:meetings_component) { create(:meeting_component, :published, participatory_space: process) } + let!(:meeting) { create(:meeting, :published, component: meetings_component) } + let!(:unpublished_meetings_component) do + create(:meeting_component, :unpublished, participatory_space: process) + end + let!(:meeting_in_unpublished_component) do + create(:meeting, :published, component: unpublished_meetings_component) + end + + before { generate_sitemap! 
} + + it "includes meetings from published components" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{meetings_component.id}/meetings/#{meeting.id}" + ) + end + + it "excludes meetings from unpublished components" do + expect(sitemap_paths).not_to include( + "/processes/#{process.slug}/f/#{unpublished_meetings_component.id}/meetings/#{meeting_in_unpublished_component.id}" + ) + end + end + + # ── Proposals ──────────────────────────────────────────────────────────────── + + describe "proposals group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:proposals_component) { create(:proposal_component, :published, participatory_space: process) } + let!(:proposal) { create(:proposal, :published, component: proposals_component) } + let!(:draft_proposal) { create(:proposal, :draft, component: proposals_component) } + + before { generate_sitemap! } + + it "includes published proposals" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{proposals_component.id}/proposals/#{proposal.id}" + ) + end + + it "excludes draft (unpublished) proposals" do + expect(sitemap_paths).not_to include( + "/processes/#{process.slug}/f/#{proposals_component.id}/proposals/#{draft_proposal.id}" + ) + end + end + + # ── Static Pages ───────────────────────────────────────────────────────────── + + describe "pages group" do + let!(:static_page) { create(:static_page, organization:) } + + before { generate_sitemap! } + + it "includes the root URL" do + # SitemapGenerator normalises the root to the bare host (no trailing slash), + # so URI.parse gives path "". Accept both forms to be safe. 
+ expect(sitemap_paths & ["", "/"]).not_to be_empty + end + + it "includes the processes listing" do + expect(sitemap_paths).to include("/processes") + end + + it "includes the assemblies listing" do + expect(sitemap_paths).to include("/assemblies") + end + + it "includes static CMS pages" do + expect(sitemap_paths).to include("/pages/#{static_page.slug}") + end + end + + # ── Blog Posts ─────────────────────────────────────────────────────────────── + + describe "blogs group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:post_component) { create(:post_component, :published, participatory_space: process) } + let!(:post) { create(:post, component: post_component) } + + before { generate_sitemap! } + + it "includes published blog posts" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{post_component.id}/posts/#{post.id}" + ) + end + end + + # ── Debates ────────────────────────────────────────────────────────────────── + + describe "debates group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:debates_component) { create(:debates_component, :published, participatory_space: process) } + let!(:debate) { create(:debate, component: debates_component) } + + before { generate_sitemap! } + + it "includes debates from published components" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{debates_component.id}/debates/#{debate.id}" + ) + end + end + + # ── Budget Projects ─────────────────────────────────────────────────────────── + + describe "budgets group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:budgets_component) { create(:budgets_component, :published, participatory_space: process) } + let!(:budget) { create(:budget, component: budgets_component) } + let!(:project) { create(:project, budget:) } + + before { generate_sitemap! 
} + + it "includes budget projects from published components" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{budgets_component.id}/budgets/#{budget.id}/projects/#{project.id}" + ) + end + end + + # ── Accountability Results ──────────────────────────────────────────────────── + + describe "accountability group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + let!(:accountability_component) do + create(:accountability_component, :published, participatory_space: process) + end + let!(:result) { create(:result, component: accountability_component) } + + before { generate_sitemap! } + + it "includes accountability results from published components" do + expect(sitemap_paths).to include( + "/processes/#{process.slug}/f/#{accountability_component.id}/results/#{result.id}" + ) + end + end + + # ── Attachments ─────────────────────────────────────────────────────────────── + + describe "attachments group" do + let!(:process) { create(:participatory_process, :published, :with_steps, organization:) } + + context "with an image attachment" do + let!(:attachment) do + create(:attachment, attached_to: process, content_type: "image/jpeg") + end + + before { generate_sitemap! } + + it "includes the ActiveStorage blob path for the image" do + blob_path = Rails.application.routes.url_helpers.rails_blob_path(attachment.file, only_path: true) + expect(sitemap_paths).to include(blob_path) + end + end + + context "with a PDF attachment" do + let!(:attachment) do + create(:attachment, attached_to: process, + content_type: "application/pdf", + file: Decidim::Dev.test_file("Exampledocument.pdf", "application/pdf")) + end + + before { generate_sitemap! 
} + + it "includes the ActiveStorage blob path for the PDF" do + blob_path = Rails.application.routes.url_helpers.rails_blob_path(attachment.file, only_path: true) + expect(sitemap_paths).to include(blob_path) + end + end + + context "with a plain-text attachment" do + let!(:other_attachment) do + # Create an attachment with a non-image, non-PDF content type by directly + # updating the column after creation (factory always creates images). + att = create(:attachment, attached_to: process) + att.update_column(:content_type, "text/plain") # rubocop:disable Rails/SkipsModelValidations + att + end + + before { generate_sitemap! } + + it "excludes non-image, non-PDF attachments" do + blob_path = Rails.application.routes.url_helpers.rails_blob_path(other_attachment.file, only_path: true) + expect(sitemap_paths).not_to include(blob_path) + end + end + end + + # ── Sitemap index ───────────────────────────────────────────────────────────── + + describe "sitemap index" do + before { generate_sitemap! 
} + + it "creates a sitemap index file" do + expect(File.exist?(tmpdir.join("sitemaps", "sitemap.xml.gz"))).to be true + end + + it "creates at least one sub-sitemap per content group" do + sub_sitemaps = Dir.glob(tmpdir.join("sitemaps", "*.xml.gz")) + .reject { |f| File.basename(f) == "sitemap.xml.gz" } + expect(sub_sitemaps.count).to be >= 1 + end + + it "index references all generated sub-sitemap files" do + sub_sitemaps = Dir.glob(tmpdir.join("sitemaps", "*.xml.gz")) + .reject { |f| File.basename(f) == "sitemap.xml.gz" } + .map { |f| File.basename(f) } + + index_xml = Zlib::GzipReader.open(tmpdir.join("sitemaps", "sitemap.xml.gz"), &:read) + index_locs = Nokogiri::XML(index_xml) + .xpath("//sm:loc", "sm" => "http://www.sitemaps.org/schemas/sitemap/0.9") + .map { |node| File.basename(node.text) } + + sub_sitemaps.each do |filename| + expect(index_locs).to include(filename) + end + end + end +end diff --git a/spec/lib/tasks/sitemap_rake_spec.rb b/spec/lib/tasks/sitemap_rake_spec.rb new file mode 100644 index 00000000..3a0d2df4 --- /dev/null +++ b/spec/lib/tasks/sitemap_rake_spec.rb @@ -0,0 +1,125 @@ +# frozen_string_literal: true + +require "rails_helper" + +describe "sitemap:generate rake task" do + subject(:task) { Rake::Task["sitemap:generate"] } + + before do + Rails.application.load_tasks unless Rake::Task.task_defined?("sitemap:generate") + task.reenable + end + + # ── Production guard ──────────────────────────────────────────────────────── + + describe "production guard" do + before { allow(Rails.env).to receive(:production?).and_return(false) } + + it "exits when not running in production" do + expect { task.invoke }.to raise_error(SystemExit) + end + end + + # ── Production behaviour ──────────────────────────────────────────────────── + + context "when running in production" do + let!(:organization) { create(:organization) } + let(:tmpdir) { Pathname.new(Dir.mktmpdir) } + + # rspec-mocks stubs must be set up in before/it/after, not in around. 
+ before do + allow(Rails.env).to receive(:production?).and_return(true) + allow(Rails).to receive(:public_path).and_return(tmpdir) + end + + # File-system isolation and SitemapGenerator state go in around so that + # cleanup runs even when an example raises. + around do |example| + orig_public_path = SitemapGenerator::Sitemap.public_path + orig_verbose = SitemapGenerator::Sitemap.verbose + + FileUtils.mkdir_p(tmpdir.join("sitemaps")) + File.write(tmpdir.join("robots.txt"), initial_robots_txt) + + SitemapGenerator::Sitemap.public_path = tmpdir + SitemapGenerator::Sitemap.verbose = false + + example.run + ensure + FileUtils.rm_rf(tmpdir) + SitemapGenerator::Sitemap.public_path = orig_public_path + SitemapGenerator::Sitemap.verbose = orig_verbose + end + + let(:initial_robots_txt) do + <<~ROBOTS + # See http://www.robotstxt.org/robotstxt.html + + User-agent: * + Disallow: /admin/ + Allow: /processes/ + Sitemap: https://participa.gencat.cat/sitemaps/old_sitemap.xml.gz + ROBOTS + end + + # ── Stale file purge ────────────────────────────────────────────────────── + + describe "stale file cleanup" do + let(:stale_path) { tmpdir.join("sitemaps", "stale_processes.xml.gz") } + + before do + # Create a file that predates the generation run. + # File.utime requires Integer/Float timestamps in Ruby 3.1+. 
+ FileUtils.touch(stale_path) + File.utime(1.hour.ago.to_i, 1.hour.ago.to_i, stale_path) + end + + it "removes sitemap files that predate the current generation run" do + task.invoke + expect(File.exist?(stale_path)).to be false + end + + it "keeps sitemap files that were written during the current generation run" do + task.invoke + fresh_files = Dir.glob(tmpdir.join("sitemaps", "*.xml.gz")) + expect(fresh_files).not_to be_empty + end + end + + # ── robots.txt update ───────────────────────────────────────────────────── + + describe "robots.txt update" do + before { task.invoke } + + let(:robots_content) { File.read(tmpdir.join("robots.txt")) } + + it "removes stale Sitemap: lines from the previous run" do + expect(robots_content).not_to include("old_sitemap.xml.gz") + end + + it "adds a Sitemap: directive for every generated file" do + generated = Dir.glob(tmpdir.join("sitemaps", "*.xml.gz")).map { |f| File.basename(f) } + expect(generated).not_to be_empty + + generated.each do |filename| + expect(robots_content).to include("Sitemap: https://#{organization.host}/sitemaps/#{filename}") + end + end + + it "preserves User-agent rules" do + expect(robots_content).to include("User-agent: *") + expect(robots_content).to include("Disallow: /admin/") + expect(robots_content).to include("Allow: /processes/") + end + + it "does not duplicate Sitemap: entries when run multiple times" do + task.reenable + task.invoke + + sitemap_lines = File.readlines(tmpdir.join("robots.txt")) + .select { |l| l.start_with?("Sitemap:") } + expect(sitemap_lines.uniq).to eq(sitemap_lines) + end + end + end +end