Simple html-code cleaner for Drupal 9 fields

Today we create a simple module that cleans the HTML code in the long formatted text field (the body) of nodes, removing classes, ids, inline styles, scripts, images and links. This is useful when your content was imported by parsing different sources and you do not have the opportunity to edit each node separately.

First, in modules/custom, create the module folder dartharth_content_cleaner. Inside it, create the file dartharth_content_cleaner.info.yml:

name: 'Content Cleaner'
type: module
description: 'Provides a tool to clean HTML content of specified content types.'
core_version_requirement: ^9
package: DARTHARTH
dependencies:
  - drupal:node
  # The settings form lists text formats via \Drupal\filter\Entity\FilterFormat,
  # so the filter module is declared explicitly rather than relied on transitively.
  - drupal:filter

dartharth_content_cleaner.links.menu.yml

# Menu link for the Content Cleaner form. It targets the
# dartharth_content_cleaner.admin_settings route and is nested under the
# menu link whose id is system.admin_config_content.
dartharth_content_cleaner.admin_settings:
  title: 'DARTHARTH Content Cleaner Settings'
  description: 'Configure content cleaning options.'
  parent: system.admin_config_content
  route_name: dartharth_content_cleaner.admin_settings
  weight: 10

dartharth_content_cleaner.routing.yml

# Route that serves ContentCleanerSettingsForm at
# /admin/config/dartharth-content-cleaner. Access requires the
# 'administer site configuration' permission.
# NOTE(review): the menu link for this route uses the parent
# system.admin_config_content, which presumably lives at /admin/config/content;
# confirm whether this path should sit beneath it so breadcrumbs line up.
dartharth_content_cleaner.admin_settings:
  path: '/admin/config/dartharth-content-cleaner'
  defaults:
    _form: '\Drupal\dartharth_content_cleaner\Form\ContentCleanerSettingsForm'
    _title: 'DARTHARTH Content Cleaner Settings'
  requirements:
    _permission: 'administer site configuration'

Next, in the folder src/Form, create the file ContentCleanerSettingsForm.php:

<?php

namespace Drupal\dartharth_content_cleaner\Form;

use Drupal\Core\Form\ConfigFormBase;
use Drupal\Core\Form\FormStateInterface;
use Drupal\Core\Messenger\MessengerInterface;
use Drupal\filter\Entity\FilterFormat;


/**
 * Admin form that cleans the HTML stored in the body field of nodes.
 *
 * The administrator picks a content type, the text format to assign to the
 * cleaned body and which optional elements (scripts, links, images) to strip.
 * On submit every node of that type is processed: class/id/style attributes
 * are always removed, the optional elements are removed when selected, and
 * the body is re-saved with the chosen text format.
 *
 * Only the node object returned by the storage (its default translation) is
 * processed; translations are not iterated here.
 */
class ContentCleanerSettingsForm extends ConfigFormBase {

    /**
     * How many nodes are loaded and processed per chunk.
     *
     * Keeps memory bounded on large sites. For very large data sets a Batch
     * API or queue based implementation would still be preferable because
     * everything here runs inside a single request.
     */
    private const CHUNK_SIZE = 50;

    /**
     * Attribute names that are always stripped from every tag (lower-case).
     */
    private const STRIPPED_ATTRIBUTES = ['class', 'id', 'style'];

    /**
     * {@inheritdoc}
     */
    protected function getEditableConfigNames() {
        return ['dartharth_content_cleaner.settings'];
    }

    /**
     * {@inheritdoc}
     */
    public function getFormId() {
        return 'dartharth_content_cleaner_settings_form';
    }

    /**
     * {@inheritdoc}
     */
    public function buildForm(array $form, FormStateInterface $form_state) {
        // Get a list of content types.
        $content_types = \Drupal::entityTypeManager()->getStorage('node_type')->loadMultiple();
        $content_type_options = [];
        foreach ($content_types as $content_type) {
            $content_type_options[$content_type->id()] = $content_type->label();
        }

        // Add content type select element.
        $form['content_type'] = [
            '#type' => 'select',
            '#title' => $this->t('Content type'),
            '#options' => $content_type_options,
            '#required' => TRUE,
        ];

        // Get a list of text formats.
        $text_formats = FilterFormat::loadMultiple();
        $text_format_options = [];
        foreach ($text_formats as $text_format) {
            $text_format_options[$text_format->id()] = $text_format->label();
        }

        // Add text format select element.
        $form['text_format'] = [
            '#type' => 'select',
            '#title' => $this->t('Text format'),
            '#options' => $text_format_options,
            '#required' => TRUE,
            '#description' => $this->t('Select the text format for the cleaned content.'),
        ];

        // Add cleaning options fieldset. The fieldset is not a #tree, so the
        // checkbox values are available at the top level of the form values.
        $form['cleaning_options'] = [
            '#type' => 'fieldset',
            '#title' => $this->t('Cleaning options'),
        ];

        // Add cleaning option checkboxes.
        $form['cleaning_options']['remove_scripts'] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Remove scripts'),
        ];

        $form['cleaning_options']['remove_links'] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Remove links'),
        ];

        $form['cleaning_options']['remove_images'] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Remove images'),
        ];

        // ConfigFormBase::buildForm() already adds the primary submit button
        // under $form['actions']. Relabel that one instead of adding a second
        // button, which would otherwise render two submit buttons.
        $form = parent::buildForm($form, $form_state);
        $form['actions']['submit']['#value'] = $this->t('Start cleaning');

        return $form;
    }

    /**
     * {@inheritdoc}
     *
     * Cleans the body of every node of the selected content type and reports
     * how many nodes were actually modified and saved.
     */
    public function submitForm(array &$form, FormStateInterface $form_state) {
        $content_type = $form_state->getValue('content_type');
        $content_filter = $form_state->getValue('text_format');
        $remove_scripts = (bool) $form_state->getValue('remove_scripts');
        $remove_links = (bool) $form_state->getValue('remove_links');
        $remove_images = (bool) $form_state->getValue('remove_images');

        // Get the list of nodes of the selected content type. This is an
        // administrative bulk operation, so node access is explicitly
        // bypassed; an explicit accessCheck() call is also mandatory on
        // newer Drupal versions.
        $nids = \Drupal::entityQuery('node')
            ->accessCheck(FALSE)
            ->condition('type', $content_type)
            ->execute();
        $node_storage = \Drupal::entityTypeManager()->getStorage('node');

        // Counter for corrected nodes.
        $corrected_nodes = 0;

        foreach (array_chunk($nids, self::CHUNK_SIZE) as $chunk) {
            /** @var \Drupal\node\NodeInterface $node */
            foreach ($node_storage->loadMultiple($chunk) as $node) {
                // Content types without a body field have nothing to clean;
                // calling get('body') on them would throw.
                if (!$node->hasField('body')) {
                    continue;
                }

                $changed = FALSE;
                // Update each item in place so other properties of the item
                // (e.g. the summary) are preserved.
                foreach ($node->get('body') as $item) {
                    $original = $item->value;
                    if ($original === NULL || $original === '') {
                        continue;
                    }

                    $cleaned = $this->cleanUpHtml($original, $remove_scripts, $remove_links, $remove_images);
                    if ($cleaned !== $original || $item->format !== $content_filter) {
                        $item->value = $cleaned;
                        $item->format = $content_filter;
                        $changed = TRUE;
                    }
                }

                // Only save (and count) nodes that were really modified.
                if ($changed) {
                    $node->save();
                    $corrected_nodes++;
                }
            }

            // Release the processed entities from the static cache so memory
            // does not grow with the number of nodes.
            $node_storage->resetCache($chunk);
        }

        // Set a message indicating the number of corrected nodes.
        $this->messenger()->addMessage($this->t('@count nodes have been cleaned.', ['@count' => $corrected_nodes]));
    }

    /**
     * Removes unwanted elements and attributes from an HTML fragment.
     *
     * NOTE: this is a regex based best-effort cleaner, not a security filter;
     * unusual or malformed markup may not be handled. If a regular expression
     * fails (preg_* returns NULL) the step is skipped and the input of that
     * step is kept rather than discarding the content.
     *
     * @param string $html
     *   The HTML fragment to clean.
     * @param bool $remove_scripts
     *   Whether <script>...</script> elements are removed with their content.
     * @param bool $remove_links
     *   Whether <a> elements are unwrapped (the link text is kept).
     * @param bool $remove_images
     *   Whether <img> tags are removed.
     *
     * @return string
     *   The cleaned HTML.
     */
    private function cleanUpHtml($html, $remove_scripts, $remove_links, $remove_images) {
        $html = (string) $html;
        if ($html === '') {
            return '';
        }

        // Remove scripts if the option is selected. Case-insensitive and
        // newline-aware so multi-line and upper-case scripts are covered.
        if ($remove_scripts) {
            $html = preg_replace('/<script(?=[\s>])[^>]*>.*?<\/script\s*>/is', '', $html) ?? $html;
        }

        // Remove links if the option is selected, keeping the inner content.
        // Matches any <a> tag regardless of attribute order or quoting.
        if ($remove_links) {
            $html = preg_replace('/<a(?=[\s>])[^>]*>(.*?)<\/a\s*>/is', '$1', $html) ?? $html;
        }

        // Remove images if the option is selected.
        if ($remove_images) {
            $html = preg_replace('/<img(?=[\s\/>])[^>]*>/i', '', $html) ?? $html;
        }

        // Remove unnecessary attributes (classes, ids, styles).
        return $this->stripAttributes($html);
    }

    /**
     * Strips class, id and style attributes from every opening tag.
     *
     * Each tag is tokenised attribute by attribute, so all matching
     * attributes of a tag are removed (not just one), quoted values are
     * consumed as a unit, and look-alikes such as data-id are left untouched.
     *
     * @param string $html
     *   The HTML fragment to process.
     *
     * @return string
     *   The HTML without class/id/style attributes.
     */
    private function stripAttributes($html) {
        $stripped = self::STRIPPED_ATTRIBUTES;

        // Opening tags that contain whitespace, i.e. that may carry
        // attributes. Quoted values may contain '>' without ending the tag.
        $tag_pattern = '/<[a-z][^\s>\/]*\s(?:"[^"]*"|\'[^\']*\'|[^"\'>])*>/i';
        // A single attribute: name, optionally followed by a double-quoted,
        // single-quoted or unquoted value.
        $attribute_pattern = '/\s+([^\s=\/>"\']+)(?:\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s"\'>]+))?/';

        $result = preg_replace_callback(
            $tag_pattern,
            static function (array $tag) use ($attribute_pattern, $stripped) {
                $cleaned_tag = preg_replace_callback(
                    $attribute_pattern,
                    static function (array $attribute) use ($stripped) {
                        return in_array(strtolower($attribute[1]), $stripped, TRUE) ? '' : $attribute[0];
                    },
                    $tag[0]
                );

                return $cleaned_tag ?? $tag[0];
            },
            $html
        );

        return $result ?? $html;
    }

}

That's it. Now we need to enable the module and go to the module admin page /admin/config/dartharth-content-cleaner, where we can start cleaning nodes.

Link to GitHub

Plain text

  • No HTML tags allowed.
  • Lines and paragraphs break automatically.
  • Web page addresses and email addresses turn into links automatically.
The comment language code.