Lightweight NLP library for text parsing, extraction, and transformation in JavaScript.
<i>↬<sub>ᔐᖜ</sub><b>↬</b></i> <sub></sub> and how hard it is to actually <b>parse</b> and <i>use</i>?
</ul> </div> <!-- spacer --> <img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <div align="left"> <img height="10px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise <i><a href="https://observablehq.com/@spencermountain/compromise-justification">tries its best</a></i> to turn text into data. <br/> <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>it makes limited and sensible decisions. <br/> <sub > <img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> it's not as smart as you'd think. </sub> <img height="45px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <!-- it is <a href="https://docs.compromise.cool/compromise-filesize">small</a>, <a href="https://docs.compromise.cool/compromise-performance">quick</a>, and often <i><a href="https://docs.compromise.cool/compromise-accuracy">good-enough</a></i>. <br/> --> </div> <img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
import nlp from 'compromise'
let doc = nlp('she sells seashells by the seashore.')
doc.verbs().toPastTense()
doc.text()
// 'she sold seashells by the seashore.'
<!-- spacer -->
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<div align="left">
<i>don't be fancy, at all:</i>
</div>
if (doc.has('simon says #Verb')) {
return true
}
<!-- spacer -->
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221814-05ed1680-ffb8-11e9-8b6b-c7528d163871.png"/>
</div>
<div align="left">
<i>grab parts of the text:</i>
</div>
let doc = nlp(entireNovel)
doc.match('the #Adjective of times').text()
// "the blurst of times?"
<div align="right">
<a href="https://docs.compromise.cool/compromise-match">match docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221837-0d142480-ffb8-11e9-9d30-90669f1b897c.png"/>
</div>
<!-- spacer -->
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<i>and get data:</i>
import plg from 'compromise-speech'
nlp.extend(plg)
let doc = nlp('Milwaukee has certainly had its share of visitors..')
doc.compute('syllables')
doc.places().json()
/*
[{
"text": "Milwaukee",
"terms": [{
"normal": "milwaukee",
"syllables": ["mil", "wau", "kee"]
}]
}]
*/
<div align="right">
<a href="https://docs.compromise.cool/compromise-json">json docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221814-05ed1680-ffb8-11e9-8b6b-c7528d163871.png"/>
</div>
<!-- spacer -->
<img height="30px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
avoid the problems of brittle parsers:
let doc = nlp("we're not gonna take it..")
doc.has('gonna') // true
doc.has('going to') // true (implicit)
// transform
doc.contractions().expand()
doc.text()
// 'we are not going to take it..'
<div align="right">
<a href="https://docs.compromise.cool/compromise-contractions">contraction docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221814-05ed1680-ffb8-11e9-8b6b-c7528d163871.png"/>
</div>
<!-- spacer -->
<img height="30" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
and whip stuff around like it's data:
let doc = nlp('ninety five thousand and fifty two')
doc.numbers().add(20)
doc.text()
// 'ninety five thousand and seventy two'
<div align="right">
<a href="https://docs.compromise.cool/compromise-values">number docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221837-0d142480-ffb8-11e9-9d30-90669f1b897c.png"/>
</div>
<!-- spacer -->
<img height="30" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<sub>-because it actually is-</sub>
let doc = nlp('the purple dinosaur')
doc.nouns().toPlural()
doc.text()
// 'the purple dinosaurs'
<div align="right">
<a href="https://docs.compromise.cool/nouns">noun docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221731-e8b84800-ffb7-11e9-8453-6395e0e903fa.png"/>
</div>
<!-- spacer -->
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
Use it on the client-side:
<script src="https://unpkg.com/compromise"></script>
<script>
var doc = nlp('two bottles of beer')
doc.numbers().minus(1)
document.body.innerHTML = doc.text()
// 'one bottle of beer'
</script>
or likewise:
import nlp from 'compromise'
var doc = nlp('London is calling')
doc.verbs().toNegative()
// 'London is not calling'
<img height="75px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<!--
bragging graphs
-->
<!-- spacer -->
<img height="30" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
compromise is ~250kb (minified):
<div align="center"> <!-- filesize --> <a href="https://bundlephobia.com/result?p=compromise"> <img width="600" src="https://user-images.githubusercontent.com/399657/68234819-14dfc300-ffd0-11e9-8b30-cb8545707b29.png"/> </a> </div>it's pretty fast. It can run on keypress:
<div align="center"> <a href="https://observablehq.com/@spencermountain/compromise-performance"> <img width="600" src="https://user-images.githubusercontent.com/399657/159795115-ed62440a-be41-424c-baa4-8dd15c48377d.png"/> </a> </div>it works mainly by <a href="https://observablehq.com/@spencermountain/verbs">conjugating all forms</a> of a basic word list.
The final lexicon is <a href="https://observablehq.com/@spencermountain/compromise-lexicon">~14,000 words</a>:
<div align="center"> <img width="600" src="https://user-images.githubusercontent.com/399657/68234805-0d201e80-ffd0-11e9-8dc6-f7a600352555.png"/> </div>you can read more about how it works, here. it's weird.
<!-- spacer --> <img height="75px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <!-- one/two/three parts --> <p align="left"> <sub>okay -</sub> <h1> <code>compromise/one</code> </h1> <p align="center">A <code>tokenizer</code> of words, sentences, and punctuation.</p> <img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <p>import nlp from 'compromise/one'
let doc = nlp("Wayne's World, party time")
let data = doc.json()
/* [{
normal:"wayne's world party time",
terms:[{ text: "Wayne's", normal: "wayne" },
...
]
}]
*/
<div align="right">
<a href="https://docs.compromise.cool/compromise-tokenization">tokenizer docs</a>
</div>
<b>compromise/one</b> splits your text up, wraps it in a handy API,
<ul> <sub>and does nothing else -</sub> </ul> <img height="25px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/><b>/one</b> is quick - most sentences take a 10th of a millisecond.
It can do <b>~1mb</b> of text a second - or 10 wikipedia pages.
<i>Infinite Jest</i> takes 3s.
<div align="right"> You can also parallelize, or stream text to it with <a href="https://github.com/spencermountain/compromise/tree/master/plugins/speed">compromise-speed</a>. </div> <!-- spacer --> <img height="60px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <!-- two --> <p align="center"> <h1 align="left"> <code>compromise/two</code> </h1> <p align="center">A <code>part-of-speech</code> tagger, and grammar-interpreter.</p> <img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/> <p>import nlp from 'compromise/two'
let doc = nlp("Wayne's World, party time")
let str = doc.match('#Possessive #Noun').text()
// "Wayne's World"
<div align="right">
<a href="https://docs.compromise.cool/compromise-tagger">tagger docs</a>
</div>
<p>
<img height="25px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
</p>
<b>compromise/two</b> automatically calculates the very basic grammar of each word.
<sub>this is more useful than people sometimes realize.</sub>
Light grammar helps you write cleaner templates, and get closer to the information.
<!-- Part-of-speech tagging is profoundly-difficult task to get 100% on. It is also a profoundly easy task to get 85% on. --> <img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>compromise has <b>83 tags</b>, arranged in <a href="https://observablehq.com/@spencermountain/compromise-tags">a handsome graph</a>.
<b>#FirstName</b> → <b>#Person</b> → <b>#ProperNoun</b> → <b>#Noun</b>
you can see the grammar of each word by running <code>doc.debug()</code>.
you can see the reasoning for each tag with <code>nlp.verbose('tagger')</code>.
if you prefer <a href="https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html"><i>Penn tags</i></a>, you can derive them with:
let doc = nlp('welcome thrillho')
doc.compute('penn')
doc.json()
<img height="60px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<!-- three -->
<p align="center">
<h1 align="left">
<code>compromise/three</code>
</h1>
<p align="center"><code>Phrase</code> and sentence tooling.</p>
<img height="15px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
<p>
import nlp from 'compromise/three'
let doc = nlp("Wayne's World, party time")
let str = doc.people().normalize().text()
// "wayne"
<div align="right">
<a href="https://docs.compromise.cool/compromise-selections">selection docs</a>
</div>
<b>compromise/three</b> is a set of tooling to <i>zoom into</i> and operate on parts of a text.
<code>.numbers()</code> grabs all the numbers in a document, for example - and extends it with new methods, like <code>.subtract()</code>.
When you have a phrase, or group of words, you can see additional metadata about it with <code>.json()</code>:
let doc = nlp('four out of five dentists')
console.log(doc.fractions().json())
/*[{
text: 'four out of five',
terms: [ [Object], [Object], [Object], [Object] ],
fraction: { numerator: 4, denominator: 5, decimal: 0.8 }
}
]*/
let doc = nlp('$4.09CAD')
doc.money().json()
/*[{
text: '$4.09CAD',
terms: [ [Object] ],
number: { prefix: '$', num: 4.09, suffix: 'cad'}
}
]*/
<img height="80px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
(match methods use the match-syntax.)
(these methods are on the main nlp object)
nlp.tokenize(str) - parse text without running POS-tagging
nlp.lazy(str, match) - scan through a text with minimal analysis
nlp.plugin({}) - mix in a compromise-plugin
nlp.parseMatch(str) - pre-parse any match statements into json
nlp.world() - grab or change library internals
nlp.model() - grab all current linguistic data
nlp.methods() - grab or change internal methods
nlp.hooks() - see which compute methods run automatically
nlp.verbose(mode) - log our decision-making for debugging
nlp.version - current semver version of the library
nlp.addWords(obj, isFrozen?) - add new words to the lexicon
nlp.addTags(obj) - add new tags to the tagSet
nlp.typeahead(arr) - add words to the auto-fill dictionary
nlp.buildTrie(arr) - compile a list of words into a fast lookup form
nlp.buildNet(arr) - compile a list of matches into a fast match form
'football captain' → 'football captains'
'turnovers' → 'turnover'
'will go' → 'went'
'walked' → 'walks'
'walked' → 'will walk'
'walks' → 'walk'
'walks' → 'walking'
'drive' → 'had driven'
'went' → 'did not go'
"didn't study" → 'studied'
5
five
fifth or 5th
five or 5
'$2.50'
he walks -> he walked
he walked -> he walks
he walks -> he will walk
he walks -> he walk
he walks -> he didn't walk
?
!
? or !
'quick'
'wash-out'
'(939) 555-0113'
'#nlp'
'hi@compromise.cool'
:)
💋
'@nlp_compromise'
'compromise.cool'
'he'
'but'
'of'
'Mrs.'
people() + places() + organizations()
'quickly'
'FBI'
"Spencer's"
This library comes with a considerate, common-sense baseline for English grammar.
You're free to change, or lay waste to, any settings - which is the fun part actually.
the easiest way is just to suggest tags for any given words:
let myWords = {
kermit: 'FirstName',
fozzie: 'FirstName',
}
let doc = nlp(muppetText, myWords)
or make heavier changes with a compromise-plugin.
import nlp from 'compromise'
nlp.extend({
// add new tags
tags: {
Character: {
isA: 'Person',
notA: 'Adjective',
},
},
// add or change words in the lexicon
words: {
kermit: 'Character',
gonzo: 'Character',
},
// change inflections
irregulars: {
get: {
pastTense: 'gotten',
gerund: 'gettin',
},
},
// add new methods to compromise
api: View => {
View.prototype.kermitVoice = function () {
this.sentences().prepend('well,')
this.match('i [(am|was)]').prepend('um,')
return this
}
},
})
<div align="right">
<a href="https://docs.compromise.cool/compromise-plugins">.plugin() docs</a>
</div>
<div align="center">
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221848-11404200-ffb8-11e9-90cd-3adee8d8564f.png"/>
</div>
<!-- spacer -->
<div >
<img height="50px" src="https://user-images.githubusercontent.com/399657/68221862-17ceb980-ffb8-11e9-87d4-7b30b6488f16.png"/>
</div>
README truncated (exceeds 50KB). View the full version at the source.
Preview
Markdown
[![RegSeal Verification Status](https://regseal.ai/api/v1/registry/badge/npm-compromise)](https://attestry.ai/models/npm-compromise)
HTML
<a href="https://attestry.ai/models/npm-compromise"><img src="https://regseal.ai/api/v1/registry/badge/npm-compromise" alt="RegSeal Verification Status" /></a>