Skip to content
This repository has been archived by the owner on Dec 25, 2022. It is now read-only.

Commit

Permalink
tmp
Browse files Browse the repository at this point in the history
  • Loading branch information
wachterjohannes committed Jun 11, 2017
1 parent b4f818e commit adabc78
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 19 deletions.
48 changes: 41 additions & 7 deletions src/Component/Pucene/Dbal/DocumentPersister.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ public function __construct(Connection $connection, PuceneSchema $schema)
public function persist(Document $document, array $fields)
{
$this->insertDocument($document);
$docCount = $this->connection->createQueryBuilder()
->select('COUNT(id)')
->from($this->schema->getDocumentsTableName())
->execute()
->fetchColumn();

foreach ($fields as $field) {
$fieldTerms = [];
Expand All @@ -56,19 +61,37 @@ public function persist(Document $document, array $fields)
$token->getEncodedTerm(),
ElasticsearchPrecision::fieldNorm($field->getNumberOfTerms())
);

$this->connection->createQueryBuilder()
->update($this->schema->getDocumentTermsTableName())
->set('frequency', 'frequency + 1')
->andWhere('field_name = :fieldName')
->andWhere('term = :term')
->setParameter('fieldName', $field->getName())
->setParameter('term', $token->getEncodedTerm())
->execute();
}

// update term frequency
foreach ($fieldTerms as $term => $frequency) {
$this->connection->update(
$this->schema->getDocumentTermsTableName(),
[
'term_frequency' => sqrt($frequency),
],
['document_id' => $document->getId(), 'field_name' => $field->getName(), 'term' => $term]
);
$this->connection->createQueryBuilder()
->update($this->schema->getDocumentTermsTableName())
->set('term_frequency', sqrt($frequency))
->set('score', 'field_norm * ' . sqrt($frequency))
->andWhere('document_ID = :document')
->andWhere('field_name = :fieldName')
->andWhere('term = :term')
->setParameter('document', $document->getId())
->setParameter('fieldName', $field->getName())
->setParameter('term', $term)
->execute();
}
}

$this->connection->createQueryBuilder()
->update($this->schema->getDocumentTermsTableName())
->set('idf', '1 + log('.$docCount.' / (frequency + 1))')
->execute();
}

/**
Expand Down Expand Up @@ -101,13 +124,24 @@ protected function insertDocument(Document $document)
*/
protected function insertToken(string $documentId, string $fieldName, $term, $fieldNorm)
{
    // Read the `frequency` value of the first existing row that has the same
    // field name and term, so the new row can be seeded with it.
    $statement = $this->connection->createQueryBuilder()
        ->select('frequency')
        ->from($this->schema->getDocumentTermsTableName())
        ->andWhere('field_name = :fieldName')
        ->andWhere('term = :term')
        ->setParameter('fieldName', $fieldName)
        ->setParameter('term', $term)
        ->execute();
    $knownFrequency = $statement->fetchColumn();

    // Any falsy lookup result (including "no row found") is stored as 0.
    if (!$knownFrequency) {
        $knownFrequency = 0;
    }

    $tokenRow = [
        'document_id' => $documentId,
        'field_name' => $fieldName,
        'term' => $term,
        'field_norm' => $fieldNorm,
        'frequency' => $knownFrequency,
    ];

    $this->connection->insert($this->schema->getDocumentTermsTableName(), $tokenRow);
}
Expand Down
24 changes: 15 additions & 9 deletions src/Component/Pucene/Dbal/Interpreter/Element/TermInterpreter.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ public function interpret(ElementInterface $element, PuceneQueryBuilder $queryBu
$expr = $queryBuilder->expr();
$name = $queryBuilder->joinTerm($element->getField(), $element->getTerm());

$queryBuilder->addSelect(sprintf('(%1$s.term_frequency * %1$s.field_norm) as %1$sValue', $name));
$queryBuilder->addSelect(sprintf('%1$s.id as %1$sId', $name));
$queryBuilder->addSelect(sprintf('(%1$s.score) as %1$sValue', $name));
$queryBuilder->addSelect(sprintf('%1$s.idf as %1$sIdf', $name));

return $expr->isNotNull($name . '.id');
}
Expand All @@ -43,18 +43,24 @@ public function scoring(ElementInterface $element, ScoringAlgorithm $scoring, $q
*/
public function newScoring(ElementInterface $element, ScoringAlgorithm $scoring, array $row, $queryNorm = null)
{
    // Scores a single term purely from values already present in $row: the
    // columns named "<termName>Value" and "<termName>Idf". $scoring is no
    // longer consulted here; the parameter stays so existing callers keep
    // working with an unchanged signature.
    $termName = 'term' . ucfirst($element->getField()) . ucfirst($element->getTerm());
    $valueName = $termName . 'Value';
    $idfName = $termName . 'Idf';

    // isset() is false both for a missing key and for a null value, so one
    // check replaces the array_key_exists()/null comparisons.
    if (!isset($row[$valueName], $row[$idfName])) {
        return 0;
    }

    $idf = $row[$idfName];
    $factor = $idf * $element->getBoost();
    if ($queryNorm) {
        // With a (truthy) query norm the idf is multiplied in a second time,
        // together with the norm itself.
        $factor *= $idf * $queryNorm;
    }

    return $row[$valueName] * $factor;
}

/**
Expand Down
3 changes: 3 additions & 0 deletions src/Component/Pucene/Dbal/PuceneSchema.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ private function createDocumentTermsTable()
$fields->addColumn('term', 'string', ['length' => 255]);
$fields->addColumn('term_frequency', 'float', ['default' => 0]);
$fields->addColumn('field_norm', 'float', ['default' => 0]);
$fields->addColumn('score', 'float', ['default' => 0]);
$fields->addColumn('frequency', 'integer', ['default' => 0]);
$fields->addColumn('idf', 'float', ['default' => 0]);

$fields->setPrimaryKey(['id']);
$fields->addForeignKeyConstraint(
Expand Down
16 changes: 13 additions & 3 deletions src/Component/Pucene/Dbal/ScoringAlgorithm.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ class ScoringAlgorithm
*/
private $docCount;

/**
* @var int[]
*/
private $docCounts = [];

/**
* @param PuceneQueryBuilder $queryBuilder
* @param PuceneSchema $schema
Expand Down Expand Up @@ -108,13 +113,18 @@ public function inverseDocumentFrequency(ElementInterface $element): float
*
* @return float
*/
public function calculateInverseDocumentFrequency($docCount)
{
    // 1 + ln(total document count / ($docCount + 1)). The total comes from
    // getDocCount(); the +1 keeps the divisor non-zero when $docCount is 0.
    return 1 + log((float) $this->getDocCount() / ($docCount + 1));
}

private function getDocCountForElement(ElementInterface $element)
{
$key = $element->getField() . $element->getTerm();
if (array_key_exists($key, $this->docCounts)) {
return $this->docCounts[$key];
}

$queryBuilder = (new PuceneQueryBuilder($this->queryBuilder->getConnection(), $this->schema))
->select('count(document.id) as count')
->from($this->schema->getDocumentsTableName(), 'document');
Expand All @@ -124,10 +134,10 @@ private function getDocCountForElement(ElementInterface $element)
$queryBuilder->where($expression);
}

return (int) $queryBuilder->execute()->fetchColumn();
return $this->docCounts[$key] = (int)$queryBuilder->execute()->fetchColumn();
}

private function getDocCount()
public function getDocCount()
{
if ($this->docCount) {
return $this->docCount;
Expand Down

0 comments on commit adabc78

Please sign in to comment.