Skip to content
This repository has been archived by the owner on Dec 25, 2022. It is now read-only.

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
wachterjohannes committed Jun 11, 2017
1 parent b4f818e commit f31a6bc
Show file tree
Hide file tree
Showing 11 changed files with 107 additions and 19 deletions.
2 changes: 2 additions & 0 deletions src/Component/Client/IndexInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@ public function search(Search $search, $type);
* @return array
*/
public function get($type, $id);

/**
 * Runs implementation-specific maintenance on the index.
 *
 * NOTE(review): presumably intended to be called once after a batch of
 * documents has been indexed — confirm against the callers.
 */
public function optimize();
}
8 changes: 8 additions & 0 deletions src/Component/Elasticsearch/ElasticsearchIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,12 @@ public function get($type, $id)

return $response;
}

/**
* {@inheritdoc}
*/
public function optimize()
{
// Intentionally empty: this adapter currently has nothing to optimize.
}
}
5 changes: 5 additions & 0 deletions src/Component/Pucene/Dbal/DbalStorage.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ public function termStatistics()
return new DbalTermStatistics($this->connection, $this->getSchema());
}

/**
 * Forwards the optimize request to the persister.
 */
public function optimize()
{
$this->persister->optimize();
}

public function getConnection()
{
return $this->connection;
Expand Down
55 changes: 48 additions & 7 deletions src/Component/Pucene/Dbal/DocumentPersister.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,51 @@ public function persist(Document $document, array $fields)
$token->getEncodedTerm(),
ElasticsearchPrecision::fieldNorm($field->getNumberOfTerms())
);

$this->connection->createQueryBuilder()
->update($this->schema->getDocumentTermsTableName())
->set('frequency', 'frequency + 1')
->andWhere('field_name = :fieldName')
->andWhere('term = :term')
->setParameter('fieldName', $field->getName())
->setParameter('term', $token->getEncodedTerm())
->execute();
}

// update term frequency
foreach ($fieldTerms as $term => $frequency) {
$this->connection->update(
$this->schema->getDocumentTermsTableName(),
[
'term_frequency' => sqrt($frequency),
],
['document_id' => $document->getId(), 'field_name' => $field->getName(), 'term' => $term]
);
$this->connection->createQueryBuilder()
->update($this->schema->getDocumentTermsTableName())
->set('term_frequency', sqrt($frequency))
->set('score', 'field_norm * ' . sqrt($frequency))
->andWhere('document_ID = :document')
->andWhere('field_name = :fieldName')
->andWhere('term = :term')
->setParameter('document', $document->getId())
->setParameter('fieldName', $field->getName())
->setParameter('term', $term)
->execute();
}
}
}

/**
 * Recomputes the stored inverse-document-frequency of every document-term row.
 *
 * Counts the rows of the documents table and writes
 * `1 + log(docCount / (frequency + 1))` into the `idf` column of the
 * document-terms table with a single UPDATE statement.
 *
 * NOTE(review): `log` is evaluated by the database here and its base is
 * platform-specific (e.g. natural on MySQL, base 10 on PostgreSQL), whereas
 * PHP's log() is the natural logarithm — confirm for the target platform.
 */
public function optimize()
{
    // TODO recalculate term frequency

    // The fetched column arrives as a string (or a falsy value when nothing
    // was fetched), so normalise it to an integer before formatting.
    $docCount = (int) $this->connection->createQueryBuilder()
        ->select('COUNT(id)')
        ->from($this->schema->getDocumentsTableName())
        ->execute()
        ->fetchColumn();

    // Emit the count as a decimal literal (e.g. "10.0"). With two integer
    // operands several platforms perform integer division, which truncates
    // the ratio and yields log(0) once frequency + 1 exceeds the count.
    // %F is locale-independent, so the decimal separator is always ".".
    $idfExpression = sprintf('1 + log(%.1F / (frequency + 1))', $docCount);

    // calculate inverse-document-frequency
    $this->connection->createQueryBuilder()
        ->update($this->schema->getDocumentTermsTableName())
        ->set('idf', $idfExpression)
        ->execute();
}

/**
* @param Document $document
*/
Expand Down Expand Up @@ -101,13 +131,24 @@ protected function insertDocument(Document $document)
*/
protected function insertToken(string $documentId, string $fieldName, $term, $fieldNorm)
{
    // Read the frequency already stored for this field/term combination.
    $statement = $this->connection->createQueryBuilder()
        ->select('frequency')
        ->from($this->schema->getDocumentTermsTableName())
        ->andWhere('field_name = :fieldName')
        ->andWhere('term = :term')
        ->setParameter('fieldName', $fieldName)
        ->setParameter('term', $term)
        ->execute();

    // Any falsy result (for instance when nothing was fetched) counts as 0.
    $storedFrequency = $statement->fetchColumn();
    if (!$storedFrequency) {
        $storedFrequency = 0;
    }

    // The new row starts out with the frequency found above.
    $row = [
        'document_id' => $documentId,
        'field_name' => $fieldName,
        'term' => $term,
        'field_norm' => $fieldNorm,
        'frequency' => $storedFrequency,
    ];

    $this->connection->insert($this->schema->getDocumentTermsTableName(), $row);
}
Expand Down
24 changes: 15 additions & 9 deletions src/Component/Pucene/Dbal/Interpreter/Element/TermInterpreter.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ public function interpret(ElementInterface $element, PuceneQueryBuilder $queryBu
$expr = $queryBuilder->expr();
$name = $queryBuilder->joinTerm($element->getField(), $element->getTerm());

$queryBuilder->addSelect(sprintf('(%1$s.term_frequency * %1$s.field_norm) as %1$sValue', $name));
$queryBuilder->addSelect(sprintf('%1$s.id as %1$sId', $name));
$queryBuilder->addSelect(sprintf('(%1$s.score) as %1$sValue', $name));
$queryBuilder->addSelect(sprintf('%1$s.idf as %1$sIdf', $name));

return $expr->isNotNull($name . '.id');
}
Expand All @@ -43,18 +43,24 @@ public function scoring(ElementInterface $element, ScoringAlgorithm $scoring, $q
*/
// NOTE(review): this span looks like a rendered diff hunk, so superseded and
// replacement statements may both be present — confirm against the real file.
public function newScoring(ElementInterface $element, ScoringAlgorithm $scoring, array $row, $queryNorm = null)
{
$idf = $scoring->inverseDocumentFrequency($element);
// Keys under which this term's value and idf are looked up in $row;
// presumably they match the aliases selected in interpret() — verify.
$termName = 'term' . ucfirst($element->getField()) . ucfirst($element->getTerm());
$idfName = $termName . 'Idf';
$valueName = $termName . 'Value';
// A missing or null value/idf entry gives this term a score of 0.
if (!array_key_exists($valueName, $row)
|| !array_key_exists($idfName, $row)
|| $row[$valueName] === null
|| $row[$idfName] === null
) {
return 0;
}

// Take the idf from the row and weight it with the element's boost.
$idf = $row[$idfName];
$factor = $idf * $element->getBoost();
// With a truthy query norm, the factor is multiplied by idf again and by the norm.
if ($queryNorm) {
$factor *= $idf * $queryNorm;
}

$termName = 'term' . ucfirst($element->getField()) . ucfirst($element->getTerm()) . 'Value';
if (!array_key_exists($termName, $row) || $row[$termName] === null) {
return 0;
}

return $row[$termName] * $factor;
return $row[$valueName] * $factor;
}

/**
Expand Down
3 changes: 3 additions & 0 deletions src/Component/Pucene/Dbal/PuceneSchema.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ private function createDocumentTermsTable()
$fields->addColumn('term', 'string', ['length' => 255]);
$fields->addColumn('term_frequency', 'float', ['default' => 0]);
$fields->addColumn('field_norm', 'float', ['default' => 0]);
$fields->addColumn('score', 'float', ['default' => 0]);
$fields->addColumn('frequency', 'integer', ['default' => 0]);
$fields->addColumn('idf', 'float', ['default' => 0]);

$fields->setPrimaryKey(['id']);
$fields->addForeignKeyConstraint(
Expand Down
16 changes: 13 additions & 3 deletions src/Component/Pucene/Dbal/ScoringAlgorithm.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ class ScoringAlgorithm
*/
private $docCount;

/**
* @var int[]
*/
private $docCounts = [];

/**
* @param PuceneQueryBuilder $queryBuilder
* @param PuceneSchema $schema
Expand Down Expand Up @@ -108,13 +113,18 @@ public function inverseDocumentFrequency(ElementInterface $element): float
*
* @return float
*/
private function calculateInverseDocumentFrequency($docCount)
public function calculateInverseDocumentFrequency($docCount)
{
    // idf = 1 + ln(total document count / (given count + 1))
    $totalDocuments = (float) $this->getDocCount();
    $ratio = $totalDocuments / ($docCount + 1);

    return 1 + log($ratio);
}

private function getDocCountForElement(ElementInterface $element)
{
$key = $element->getField() . $element->getTerm();
if (array_key_exists($key, $this->docCounts)) {
return $this->docCounts[$key];
}

$queryBuilder = (new PuceneQueryBuilder($this->queryBuilder->getConnection(), $this->schema))
->select('count(document.id) as count')
->from($this->schema->getDocumentsTableName(), 'document');
Expand All @@ -124,10 +134,10 @@ private function getDocCountForElement(ElementInterface $element)
$queryBuilder->where($expression);
}

return (int) $queryBuilder->execute()->fetchColumn();
return $this->docCounts[$key] = (int)$queryBuilder->execute()->fetchColumn();
}

private function getDocCount()
public function getDocCount()
{
if ($this->docCount) {
return $this->docCount;
Expand Down
5 changes: 5 additions & 0 deletions src/Component/Pucene/PuceneIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,9 @@ public function get($type, $id)
{
return $this->storage->get($type, $id);
}

/**
 * Forwards the optimize request to the underlying storage.
 */
public function optimize()
{
$this->storage->optimize();
}
}
2 changes: 2 additions & 0 deletions src/Component/Pucene/StorageInterface.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ public function search(Search $search, $type);
public function get($type, $id);

public function termStatistics();

/**
 * Runs storage-specific maintenance.
 *
 * NOTE(review): the concrete work is up to the implementation; presumably
 * meant to run after a batch of writes — confirm against the callers.
 */
public function optimize();
}
4 changes: 4 additions & 0 deletions tests/src/TestBundle/Command/ImportJsonCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace Pucene\Tests\TestBundle\Command;

use Pucene\Component\Client\ClientInterface;
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand;
use Symfony\Component\Console\Helper\ProgressBar;
use Symfony\Component\Console\Input\InputArgument;
Expand Down Expand Up @@ -30,6 +31,7 @@ protected function configure()
*/
protected function execute(InputInterface $input, OutputInterface $output)
{
/** @var ClientInterface $client */
$client = $this->getContainer()->get('pucene.' . $input->getOption('adapter') . '.client');
$index = $client->get($input->getArgument('index'));

Expand All @@ -44,6 +46,8 @@ protected function execute(InputInterface $input, OutputInterface $output)
$progressBar->advance();
}

$index->optimize();

$progressBar->finish();
}
}
2 changes: 2 additions & 0 deletions tests/src/TestBundle/Command/ImportWikidataCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ protected function execute(InputInterface $input, OutputInterface $output)
$progressBar->advance();
}

$index->optimize();

$progressBar->finish();
}
}

0 comments on commit f31a6bc

Please sign in to comment.