Simple parsing module on Drupal 9

Today's task is to build a small module for Drupal 9 that parses articles from an external site and creates pages on our website containing the text of those articles.

The module should have a simple settings page where we can specify the source page to parse and set the templates (XPath queries) used to find the articles. On the same settings page we will also choose how many articles to parse, which content type to create on our site, which text format to use, and whether the pages are published after creation.

First, we create the folder for our module, simpleparser, in modules/custom, with the following structure:

simpleparser
    src
        Controller
            SimpleParserController.php
        Form
            SimpleParserSettingsForm.php
    simpleparser.info.yml
    simpleparser.links.menu.yml
    simpleparser.module
    simpleparser.routing.yml
    simpleparser.services.yml

Now let's see what should be in our files.

simpleparser.services.yml

# Registers the parser controller as a service so that simpleparser_cron()
# and the settings form can obtain it through
# \Drupal::service('simpleparser.custom_controller').
services:
  simpleparser.custom_controller:
    class: Drupal\simpleparser\Controller\SimpleParserController

simpleparser.routing.yml

# Admin route that renders SimpleParserSettingsForm. Access is limited to
# users holding the 'administer site configuration' permission.
simpleparser.settings:
  path: '/admin/config/system/simpleparser'
  defaults:
    _form: '\Drupal\simpleparser\Form\SimpleParserSettingsForm'
    _title: 'SimpleParser Settings'
  requirements:
    _permission: 'administer site configuration'

simpleparser.info.yml

name: SimpleParser
type: module
description: 'Simple parser for Drupal 9'
core_version_requirement: ^8 || ^9
# The controller creates node entities and the settings form lists filter
# formats, so both core modules must be enabled before this one.
dependencies:
  - drupal:node
  - drupal:filter
# Adds a "Configure" link on the Extend page pointing at the settings route.
configure: simpleparser.settings

simpleparser.links.menu.yml

# Menu link to the simpleparser.settings route, placed under the
# system.admin_config_system parent item.
simpleparser.admin_settings:
  title: 'SimpleParser Settings'
  description: 'Configure Parser module settings.'
  parent: system.admin_config_system
  route_name: simpleparser.settings
  weight: 100

simpleparser.module

<?php
use Drupal\Core\Routing\RouteSubscriberBase;
use Symfony\Component\Routing\RouteCollection;
/**
 * Implements hook_cron().
 *
 * Starts a parsing run when the "cron" parsing mode is selected. The settings
 * form describes this mode as "once per day", so the time of the last
 * successful run is kept in the State API and the parser is skipped until
 * 24 hours have passed, regardless of how often cron itself is executed.
 */
function simpleparser_cron() {
    $config = \Drupal::config('simpleparser.settings');
    if ($config->get('parsing_mode') !== 'cron') {
        return;
    }

    $state = \Drupal::state();
    $now = \Drupal::time()->getRequestTime();
    $lastRun = (int) $state->get('simpleparser.last_cron_run', 0);
    // 86400 seconds = 24 hours.
    if ($now - $lastRun < 86400) {
        return;
    }

    $controller = \Drupal::service('simpleparser.custom_controller');
    $controller->checkNewArticles();

    // Recorded only after the run returned, so a run that threw is retried on
    // the next cron invocation instead of being postponed for a day.
    $state->set('simpleparser.last_cron_run', $now);
}
/**
 * Placeholder route subscriber for the simpleparser module.
 *
 * The alterRoutes() implementation is intentionally empty; it is a hook
 * point for future route changes.
 *
 * NOTE(review): the simpleparser.services.yml shown for this module registers
 * only the controller, so this subscriber does not appear to be wired up as a
 * service - confirm before relying on it.
 */
class SimpleParserRouteSubscriber extends RouteSubscriberBase {
    /**
     * Receives the route collection; currently leaves it untouched.
     *
     * @param \Symfony\Component\Routing\RouteCollection $collection
     *   The collection of routes that may be altered.
     */
    protected function alterRoutes(RouteCollection $collection) {
        // No route alterations yet.
    }
}

src/Controller/SimpleParserController.php

<?php
namespace Drupal\simpleparser\Controller;
use GuzzleHttp\Client;
use Drupal\Core\Controller\ControllerBase;
use Drupal\node\Entity\Node;
/**
 * Downloads the configured source page and imports its articles as nodes.
 *
 * All settings are read from the 'simpleparser.settings' config object.
 * Progress and failures are reported through the messenger; failures are
 * additionally written to the 'simpleparser' log channel.
 */
class SimpleParserController extends ControllerBase {
    /**
     * Scans the source page and imports articles not yet present on the site.
     *
     * An article counts as already imported when a node of the selected
     * content type has the same value in field_import_title.
     *
     * @return int
     *   The number of new articles handed over to the import step, or 0 when
     *   the configuration is incomplete or the source page cannot be read.
     */
    public function checkNewArticles() {
        $messenger = \Drupal::messenger();
        $messenger->addMessage($this->t('Parsing has started.'));

        $settings = $this->config('simpleparser.settings');
        $sourceUrl = (string) $settings->get('source_url');
        $articlesLimit = (int) $settings->get('articles_limit');
        $selectedContentType = $settings->get('content_type');
        $xpathQuery = (string) $settings->get('xpath_articles');

        // parse_url() returns FALSE or omits keys for malformed input, so the
        // pieces needed for link resolution are verified before use.
        $parsedUrl = parse_url($sourceUrl);
        if (!is_array($parsedUrl) || empty($parsedUrl['scheme']) || empty($parsedUrl['host']) || $xpathQuery === '') {
            $messenger->addError($this->t('SimpleParser is not configured correctly: check the source URL and the XPath for article links.'));
            return 0;
        }

        $xpath = $this->loadXpath($sourceUrl);
        if ($xpath === NULL) {
            return 0;
        }

        // DOMXPath::query() returns FALSE for a malformed expression.
        $articles = $xpath->query($xpathQuery);
        if ($articles === FALSE) {
            $messenger->addError($this->t('The XPath for article links is not a valid expression.'));
            return 0;
        }

        $articlesToProcess = [];
        $counter = 0;
        foreach ($articles as $article) {
            if ($counter >= $articlesLimit) {
                break;
            }
            // The query may match text or attribute nodes, which have no
            // getAttribute() method.
            if (!$article instanceof \DOMElement) {
                continue;
            }
            $title = $article->nodeValue;
            $link = $this->resolveLink($article->getAttribute('href'), $parsedUrl);
            if ($link === NULL || trim($title) === '') {
                continue;
            }
            // Access checking is disabled on purpose: with it enabled, a cron
            // run (anonymous user) would not see unpublished imports and
            // would create them again on every run.
            $result = \Drupal::entityQuery('node')
                ->accessCheck(FALSE)
                ->condition('type', $selectedContentType)
                ->condition('field_import_title', $title)
                ->range(0, 1)
                ->execute();
            if (empty($result)) {
                $articlesToProcess[] = ['title' => $title, 'link' => $link];
            }
            $counter++;
        }

        $this->processArticle($articlesToProcess);
        $uniqueArticlesCount = count($articlesToProcess);
        $messenger->addMessage($this->t('Number of unique articles to process: @count', ['@count' => $uniqueArticlesCount]));
        return $uniqueArticlesCount;
    }

    /**
     * Downloads each article page and stores its text as a new node.
     *
     * @param array $articlesToProcess
     *   List of arrays with the keys 'title' and 'link' (absolute URL).
     */
    private function processArticle($articlesToProcess) {
        $messenger = \Drupal::messenger();
        // The settings do not change during a run, so they are read once.
        $settings = $this->config('simpleparser.settings');
        $xpathArticleContent = (string) $settings->get('xpath_article_content');
        $selectedContentType = $settings->get('content_type');
        $selectedTextFormat = $settings->get('text_format');
        $publishedStatus = $settings->get('published_status');

        $createdArticlesCount = 0;
        foreach ($articlesToProcess as $articleData) {
            $articleXpath = $this->loadXpath($articleData['link']);
            if ($articleXpath === NULL) {
                // The failure was already reported; continue with the rest.
                continue;
            }
            $contentNodes = $xpathArticleContent === '' ? FALSE : $articleXpath->query($xpathArticleContent);
            if ($contentNodes === FALSE) {
                // The same expression is used for every article, so there is
                // no point in trying the remaining ones.
                $messenger->addError($this->t('The XPath for article content is not a valid expression.'));
                break;
            }
            $articleContent = '';
            foreach ($contentNodes as $contentNode) {
                $articleContent .= trim($contentNode->textContent) . "\n";
            }
            $node = Node::create([
                'type'  => $selectedContentType,
                'title' => $articleData['title'],
                'field_import_title' => $articleData['title'],
                'body'  => [
                    'value'  => $articleContent,
                    'format' => $selectedTextFormat,
                ],
                'status' => $publishedStatus,
            ]);
            if ($node->save()) {
                $createdArticlesCount++;
                $messenger->addMessage($this->t('Created page: @title', ['@title' => $articleData['title']]));
            }
        }
        $messenger->addMessage($this->t('Total number of articles created: @count', ['@count' => $createdArticlesCount]));
    }

    /**
     * Downloads a page and returns an XPath object for its HTML.
     *
     * Uses Drupal's shared HTTP client rather than a bare Guzzle client.
     *
     * @param string $url
     *   Absolute URL of the page to download.
     *
     * @return \DOMXPath|null
     *   The XPath object, or NULL when the request failed or the response
     *   body was empty (the failure is reported and logged).
     */
    private function loadXpath($url) {
        try {
            $response = \Drupal::httpClient()->request('GET', $url);
            $html = $response->getBody()->getContents();
        }
        catch (\GuzzleHttp\Exception\GuzzleException $e) {
            \Drupal::logger('simpleparser')->error('Request to @url failed: @message', [
                '@url' => $url,
                '@message' => $e->getMessage(),
            ]);
            \Drupal::messenger()->addError($this->t('Could not download @url.', ['@url' => $url]));
            return NULL;
        }
        // DOMDocument::loadHTML() throws a ValueError on an empty string in
        // PHP 8, which the @ operator cannot suppress.
        if (trim($html) === '') {
            \Drupal::messenger()->addError($this->t('The page @url returned no content.', ['@url' => $url]));
            return NULL;
        }
        // Collect markup warnings internally instead of silencing them with @.
        $previous = libxml_use_internal_errors(TRUE);
        $dom = new \DOMDocument();
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return new \DOMXPath($dom);
    }

    /**
     * Turns an href value into an absolute http(s) URL.
     *
     * @param string $link
     *   The raw href attribute value.
     * @param array $sourceParts
     *   parse_url() result of the source page URL; 'scheme' and 'host' must
     *   be present.
     *
     * @return string|null
     *   The absolute URL, or NULL when the link is empty, is a bare fragment
     *   or uses a scheme other than http/https (mailto:, javascript:, ...).
     */
    private function resolveLink($link, array $sourceParts) {
        $link = trim($link);
        if ($link === '' || $link[0] === '#') {
            return NULL;
        }
        if (preg_match('~^https?://~i', $link)) {
            return $link;
        }
        // Any other explicit scheme cannot be fetched over HTTP.
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $link)) {
            return NULL;
        }
        // Protocol-relative link: reuse the scheme of the source page.
        if (strpos($link, '//') === 0) {
            return $sourceParts['scheme'] . ':' . $link;
        }
        $origin = $sourceParts['scheme'] . '://' . $sourceParts['host'];
        if (isset($sourceParts['port'])) {
            $origin .= ':' . $sourceParts['port'];
        }
        // Root-relative link.
        if ($link[0] === '/') {
            return $origin . $link;
        }
        // Page-relative link: resolve against the directory of the source page.
        $path = isset($sourceParts['path']) ? $sourceParts['path'] : '/';
        $slash = strrpos($path, '/');
        $directory = $slash === FALSE ? '/' : substr($path, 0, $slash + 1);
        return $origin . $directory . $link;
    }
}

A short explanation: during parsing we write the article's title to both the title field and field_import_title. This lets us edit the title on our site afterwards while still being able to detect, by comparing field_import_title, that an article has already been imported, so it will not be parsed again. Note that the selected content type must therefore have a field_import_title text field.

src/Form/SimpleParserSettingsForm.php

<?php
namespace Drupal\simpleparser\Form;
use Drupal\Core\Form\ConfigFormBase;
use Drupal\Core\Form\FormStateInterface;
use Drupal\filter\Entity\FilterFormat;
/**
 * Settings form for the SimpleParser module.
 *
 * Saves the source URL, the XPath queries and the node-creation options to
 * the 'simpleparser.settings' config object and provides a button that starts
 * a parsing run immediately.
 */
class SimpleParserSettingsForm extends ConfigFormBase {
    /**
     * {@inheritdoc}
     */
    public function getFormId() {
        return 'simpleparser_settings_form';
    }
    /**
     * {@inheritdoc}
     */
    protected function getEditableConfigNames() {
        return ['simpleparser.settings'];
    }
    /**
     * {@inheritdoc}
     */
    public function buildForm(array $form, FormStateInterface $form_state) {
        $config = $this->config('simpleparser.settings');
        $form['source_url'] = [
            '#type' => 'textfield',
            '#title' => $this->t('Source URL'),
            '#description' => $this->t('Enter the URL of the page to parse articles from.'),
            '#default_value' => $config->get('source_url'),
            '#required' => TRUE,
        ];
        $form['xpath_articles'] = [
            '#type' => 'textfield',
            '#title' => $this->t('XPath for articles links'),
            '#description' => $this->t('Enter the XPath query to extract article links from the source page. Example: "//div[@class=\'list\']//a"'),
            '#default_value' => $config->get('xpath_articles'),
            '#required' => TRUE,
        ];
        $form['articles_limit'] = [
            '#type' => 'number',
            '#title' => $this->t('Number of articles to parse'),
            '#description' => $this->t('Enter the number of articles to be parsed.'),
            '#default_value' => $config->get('articles_limit'),
            '#min' => 1,
            '#max' => 50,
            '#required' => TRUE,
        ];
        $form['content_type'] = [
            '#type' => 'select',
            '#title' => $this->t('Content type'),
            '#description' => $this->t('Select the content type to save parsed articles.'),
            '#options' => node_type_get_names(),
            '#default_value' => $config->get('content_type'),
        ];
        $form['text_format'] = [
            '#type' => 'select',
            '#title' => $this->t('Text Format'),
            '#description' => $this->t('Select the text format for the articles.'),
            '#options' => $this->getTextFormatsOptions(),
            '#default_value' => $config->get('text_format'),
        ];
        $form['xpath_article_content'] = [
            '#type' => 'textfield',
            '#title' => $this->t('XPath for article content'),
            '#description' => $this->t('Enter the XPath query to extract the article content.'),
            '#default_value' => $config->get('xpath_article_content'),
            '#required' => TRUE,
        ];
        $form['published_status'] = [
            '#type' => 'checkbox',
            '#title' => $this->t('Publish articles by default'),
            '#description' => $this->t('If checked, all new articles will be published by default.'),
            '#default_value' => $config->get('published_status'),
        ];
        $form['parsing_mode'] = [
            '#type' => 'radios',
            '#title' => $this->t('Parsing Mode'),
            '#description' => $this->t('Choose how the parsing should be triggered.'),
            '#options' => [
                'manual' => $this->t('Manual - by clicking "Start parsing" button'),
                'cron' => $this->t('Automatic - once per day via cron'),
            ],
            // Before the first save nothing is stored; preselect "manual" so
            // one of the radios is always chosen.
            '#default_value' => $config->get('parsing_mode') ?: 'manual',
        ];
        // Extra button with its own submit handler; it does not save the form.
        $form['actions']['start_parsing'] = [
            '#type' => 'submit',
            '#value' => $this->t('Start Parsing'),
            '#submit' => ['::startParsingSubmit'],
            '#button_type' => 'primary',
        ];
        return parent::buildForm($form, $form_state);
    }
    /**
     * Builds the option list for the text format select element.
     *
     * @return array
     *   Text format labels keyed by format ID.
     */
    private function getTextFormatsOptions() {
        $options = [];
        foreach (FilterFormat::loadMultiple() as $format) {
            $options[$format->id()] = $format->label();
        }
        return $options;
    }
    /**
     * {@inheritdoc}
     *
     * Accepts only absolute http:// or https:// source URLs.
     */
    public function validateForm(array &$form, FormStateInterface $form_state) {
        parent::validateForm($form, $form_state);
        $url = (string) $form_state->getValue('source_url');
        // FILTER_VALIDATE_URL alone also accepts schemes such as ftp: or
        // mailto:, which the parser cannot download over HTTP.
        $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
        if (!filter_var($url, FILTER_VALIDATE_URL) || !in_array($scheme, ['http', 'https'], TRUE)) {
            $form_state->setErrorByName('source_url', $this->t('The URL is not valid.'));
        }
    }
    /**
     * {@inheritdoc}
     */
    public function submitForm(array &$form, FormStateInterface $form_state) {
        $this->config('simpleparser.settings')
            ->set('source_url', $form_state->getValue('source_url'))
            ->set('xpath_articles', $form_state->getValue('xpath_articles'))
            ->set('content_type', $form_state->getValue('content_type'))
            ->set('text_format', $form_state->getValue('text_format'))
            ->set('xpath_article_content', $form_state->getValue('xpath_article_content'))
            ->set('published_status', $form_state->getValue('published_status'))
            // Form values arrive as strings; store the limit as an integer.
            ->set('articles_limit', (int) $form_state->getValue('articles_limit'))
            ->set('parsing_mode', $form_state->getValue('parsing_mode'))
            ->save();
        parent::submitForm($form, $form_state);
    }
    /**
     * Submit handler for the "Start Parsing" button.
     *
     * Runs the parser with the settings that are currently saved; values
     * typed into the form but not yet saved are not used.
     *
     * @param array $form
     *   The form structure.
     * @param \Drupal\Core\Form\FormStateInterface $form_state
     *   The current form state.
     */
    public function startParsingSubmit(array &$form, FormStateInterface $form_state) {
        \Drupal::messenger()->addMessage($this->t('Manual parsing has started.'));
        try {
            \Drupal::service('simpleparser.custom_controller')->checkNewArticles();
        }
        catch (\Exception $e) {
            // Show a message on the form instead of an error page when the
            // remote site is unreachable or a node cannot be saved.
            \Drupal::logger('simpleparser')->error('Manual parsing failed: @message', ['@message' => $e->getMessage()]);
            \Drupal::messenger()->addError($this->t('Parsing failed: @message', ['@message' => $e->getMessage()]));
        }
        $form_state->setRebuild(TRUE);
    }
}

Finally, let's look at why we need templates (XPath queries) for the article list and for the article itself, and how to work them out.

Suppose we have a page with a list of articles that we need to parse, and the code of the page looks something like this:

<table>
	<tbody>
		<td id="td_main_center">
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
			<div class="list">
				<a href="article_link">Article Title</a>
			</div>
		</td>
	</tbody>
</table>

And for the first step of parsing, we need to get the links to all the articles. The template for settings in this case will be as follows:

//td[@id="td_main_center"]//div[@class="list"]//a

Next, once we have the links to the articles, we need to extract the content of each article page. Again, we open an article page and see that its markup looks roughly like this:

<table>
	<tbody>
		<td id="td_main_center">
			<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
			<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
			<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
			<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
		</td>
	</tbody>
</table>

So our template for the article itself will be:

//td[@id="td_main_center"]//p

Plain text

  • No HTML tags allowed.
  • Lines and paragraphs break automatically.
  • Web page addresses and email addresses turn into links automatically.
The comment language code.