Merge pull request #135

Text statistics
This commit is contained in:
Ibrahima G. Coulibaly
2025-06-05 23:04:31 +01:00
committed by GitHub
7 changed files with 492 additions and 25 deletions

View File

@@ -1,20 +1,9 @@
import { itemCounter } from '@utils/string';
export type SplitOperatorType = 'symbol' | 'regex';
export type DisplayFormat = 'count' | 'percentage' | 'total';
export type SortingMethod = 'count' | 'alphabetic';
// Function that takes the array as arg and returns a dict of element occurrences and handle the ignoreItemCase
function dictMaker(
array: string[],
ignoreItemCase: boolean
): { [key: string]: number } {
const dict: { [key: string]: number } = {};
for (const item of array) {
const key = ignoreItemCase ? item.toLowerCase() : item;
dict[key] = (dict[key] || 0) + 1;
}
return dict;
}
// Function that sorts the dict created with dictMaker based on the chosen sorting method
function dictSorter(
dict: { [key: string]: number },
@@ -74,21 +63,27 @@ export function TopItemsList(
sortingMethod: SortingMethod,
displayFormat: DisplayFormat,
splitSeparator: string,
input: string,
input: string | string[],
deleteEmptyItems: boolean,
ignoreItemCase: boolean,
trimItems: boolean
): string {
if (!input) return '';
let array: string[];
switch (splitOperatorType) {
case 'symbol':
array = input.split(splitSeparator);
break;
case 'regex':
array = input
.split(new RegExp(splitSeparator))
.filter((item) => item !== '');
break;
if (typeof input === 'string') {
switch (splitOperatorType) {
case 'symbol':
array = input.split(splitSeparator);
break;
case 'regex':
array = input
.split(new RegExp(splitSeparator))
.filter((item) => item !== '');
break;
}
} else {
array = input;
}
// Trim items if required
@@ -102,7 +97,7 @@ export function TopItemsList(
}
// Transform the array into dict
const unsortedDict = dictMaker(array, ignoreItemCase);
const unsortedDict = itemCounter(array, ignoreItemCase);
// Sort the list if required
const sortedDict = dictSorter(unsortedDict, sortingMethod);

View File

@@ -15,6 +15,7 @@ import { tool as stringReplace } from './text-replacer/meta';
import { tool as stringRepeat } from './repeat/meta';
import { tool as stringTruncate } from './truncate/meta';
import { tool as stringBase64 } from './base64/meta';
import { tool as stringStatistic } from './statistic/meta';
export const stringTools = [
stringSplit,
@@ -33,5 +34,6 @@ export const stringTools = [
stringQuote,
stringRotate,
stringRot13,
stringBase64
stringBase64,
stringStatistic
];

View File

@@ -0,0 +1,298 @@
import { Box } from '@mui/material';
import { useState } from 'react';
import ToolTextResult from '@components/result/ToolTextResult';
import { GetGroupsType } from '@components/options/ToolOptions';
import { textStatistics } from './service';
import ToolTextInput from '@components/input/ToolTextInput';
import { InitialValuesType } from './types';
import ToolContent from '@components/ToolContent';
import { CardExampleType } from '@components/examples/ToolExamples';
import { ToolComponentProps } from '@tools/defineTool';
import TextFieldWithDesc from '@components/options/TextFieldWithDesc';
import CheckboxWithDesc from '@components/options/CheckboxWithDesc';
// Default option values handed to ToolContent and reused by the first example card:
// no custom delimiters and every optional statistic switched off.
const initialValues: InitialValuesType = {
emptyLines: false,
// Empty string: the service falls back to its built-in sentence delimiters.
sentenceDelimiters: '',
// Empty string: the service falls back to its built-in word delimiters.
wordDelimiters: '',
characterCount: false,
wordCount: false
};
// Example cards passed to ToolContent: each pairs a sample text with the options
// to apply and the result text to display for it.
// NOTE(review): the sample results report "Sentences: 1", while countSentences in
// ./service splits on every '.', '!' and '?' by default, which would give a higher
// count for these texts — confirm the samples against the actual service output.
const exampleCards: CardExampleType<InitialValuesType>[] = [
{
title: 'Text Statistics without any Flag',
description:
'This example shows basic text statistics without any additional flags.',
sampleText:
'Giraffes have long necks that can be up to 6 feet (1.8 meters) long, but they only have 7 neck vertebrae, the same as humans.',
sampleResult: `Text Statistics
==================
Characters: 125
Words: 26
Lines: 1
Sentences: 1
Paragraphs: 1`,
sampleOptions: initialValues
},
{
title: 'Text Statistics with Characters Frequency',
description:
'This example shows basic text statistics with characters frequency.',
sampleText: `The Great Barrier Reef is the world's largest coral reef system, located off the coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef is home to thousands of species of marine life, including fish, sea turtles, sharks, and dolphins. It is also a popular tourist destination, attracting millions of visitors every year. However, the reef is facing many threats, including climate change, pollution, and overfishing. Conservation efforts are being made to protect this unique and valuable ecosystem for future generations.`,
sampleResult: `Text Statistics
==================
Characters: 556
Words: 87
Lines: 1
Sentences: 1
Paragraphs: 1
Characters Frequency
==================
0: 4 (0.72%)
2: 1 (0.18%)
9: 2 (0.36%)
␣: 85 (15.29%)
e: 51 (9.17%)
i: 40 (7.19%)
s: 40 (7.19%)
t: 39 (7.01%)
a: 37 (6.65%)
o: 34 (6.12%)
r: 33 (5.94%)
n: 29 (5.22%)
l: 21 (3.78%)
f: 20 (3.60%)
h: 15 (2.70%)
d: 15 (2.70%)
c: 14 (2.52%)
u: 14 (2.52%)
,: 11 (1.98%)
g: 10 (1.80%)
m: 8 (1.44%)
v: 8 (1.44%)
.: 6 (1.08%)
p: 6 (1.08%)
y: 5 (0.90%)
b: 3 (0.54%)
w: 2 (0.36%)
': 1 (0.18%)
k: 1 (0.18%)
q: 1 (0.18%)`,
sampleOptions: {
emptyLines: false,
sentenceDelimiters: '',
wordDelimiters: '',
characterCount: true,
wordCount: false
}
},
{
title: 'Text Statistics with Characters and Words Frequencies',
description:
'This example shows basic text statistics with characters and words frequencies.',
sampleText: `The Great Barrier Reef is the world's largest coral reef system, located off the coast of Australia. It consists of over 2,900 individual reefs and 900 islands. The reef is home to thousands of species of marine life, including fish, sea turtles, sharks, and dolphins. It is also a popular tourist destination, attracting millions of visitors every year. However, the reef is facing many threats, including climate change, pollution, and overfishing. Conservation efforts are being made to protect this unique and valuable ecosystem for future generations.`,
sampleResult: `Text Statistics
==================
Characters: 556
Words: 87
Lines: 1
Sentences: 1
Paragraphs: 1
Words Frequency
==================
2: 1 (1.15%)
900: 2 (2.30%)
the: 5 (5.75%)
of: 5 (5.75%)
reef: 4 (4.60%)
is: 4 (4.60%)
and: 4 (4.60%)
it: 2 (2.30%)
to: 2 (2.30%)
including: 2 (2.30%)
great: 1 (1.15%)
barrier: 1 (1.15%)
world's: 1 (1.15%)
largest: 1 (1.15%)
coral: 1 (1.15%)
system: 1 (1.15%)
located: 1 (1.15%)
off: 1 (1.15%)
coast: 1 (1.15%)
australia: 1 (1.15%)
consists: 1 (1.15%)
over: 1 (1.15%)
individual: 1 (1.15%)
reefs: 1 (1.15%)
islands: 1 (1.15%)
home: 1 (1.15%)
thousands: 1 (1.15%)
species: 1 (1.15%)
marine: 1 (1.15%)
life: 1 (1.15%)
fish: 1 (1.15%)
sea: 1 (1.15%)
turtles: 1 (1.15%)
sharks: 1 (1.15%)
dolphins: 1 (1.15%)
also: 1 (1.15%)
a: 1 (1.15%)
popular: 1 (1.15%)
tourist: 1 (1.15%)
destination: 1 (1.15%)
attracting: 1 (1.15%)
millions: 1 (1.15%)
visitors: 1 (1.15%)
every: 1 (1.15%)
year: 1 (1.15%)
however: 1 (1.15%)
facing: 1 (1.15%)
many: 1 (1.15%)
threats: 1 (1.15%)
climate: 1 (1.15%)
change: 1 (1.15%)
pollution: 1 (1.15%)
overfishing: 1 (1.15%)
conservation: 1 (1.15%)
efforts: 1 (1.15%)
are: 1 (1.15%)
being: 1 (1.15%)
made: 1 (1.15%)
protect: 1 (1.15%)
this: 1 (1.15%)
unique: 1 (1.15%)
valuable: 1 (1.15%)
ecosystem: 1 (1.15%)
for: 1 (1.15%)
future: 1 (1.15%)
generations: 1 (1.15%)
Characters Frequency
==================
0: 4 (0.72%)
2: 1 (0.18%)
9: 2 (0.36%)
␣: 85 (15.29%)
e: 51 (9.17%)
i: 40 (7.19%)
s: 40 (7.19%)
t: 39 (7.01%)
a: 37 (6.65%)
o: 34 (6.12%)
r: 33 (5.94%)
n: 29 (5.22%)
l: 21 (3.78%)
f: 20 (3.60%)
h: 15 (2.70%)
d: 15 (2.70%)
c: 14 (2.52%)
u: 14 (2.52%)
,: 11 (1.98%)
g: 10 (1.80%)
m: 8 (1.44%)
v: 8 (1.44%)
.: 6 (1.08%)
p: 6 (1.08%)
y: 5 (0.90%)
b: 3 (0.54%)
w: 2 (0.36%)
': 1 (0.18%)
k: 1 (0.18%)
q: 1 (0.18%)`,
sampleOptions: {
emptyLines: false,
sentenceDelimiters: '',
wordDelimiters: '',
characterCount: true,
wordCount: true
}
}
];
/**
 * Page component for the text-statistics tool.
 *
 * Keeps the input text and the generated report in local state and hands
 * ToolContent the option groups, example cards and the compute callback.
 *
 * NOTE(review): the component is named `Truncate` although it renders the
 * statistics tool — looks like a leftover from a copied file; confirm before
 * renaming.
 */
export default function Truncate({
  title,
  longDescription
}: ToolComponentProps) {
  const [input, setInput] = useState<string>('');
  const [result, setResult] = useState<string>('');

  // Builds the report for the given text/options and stores it as the result.
  const compute = (options: InitialValuesType, text: string) => {
    const report = textStatistics(text, options);
    setResult(report);
  };

  // Describes the two option panels: delimiter fields and statistic toggles.
  const getGroups: GetGroupsType<InitialValuesType> = ({
    values,
    updateField
  }) => {
    const delimitersGroup = {
      title: 'Delimiters Options',
      component: (
        <Box>
          <TextFieldWithDesc
            value={values.sentenceDelimiters}
            onOwnChange={(val) => updateField('sentenceDelimiters', val)}
            placeholder="e.g. ., !, ?, ..."
            description={
              'Enter custom characters used to delimit sentences in your language (separated by comma) or leave it blank for default.'
            }
          />
          <TextFieldWithDesc
            value={values.wordDelimiters}
            onOwnChange={(val) => updateField('wordDelimiters', val)}
            placeholder="eg. \\s.,;:!?\”«»()…"
            description={
              'Enter custom Regex to count Words or leave it blank for default.'
            }
          />
        </Box>
      )
    };

    const statisticsGroup = {
      title: 'Statistics Options',
      component: (
        <Box>
          <CheckboxWithDesc
            checked={values.wordCount}
            onChange={(value) => updateField('wordCount', value)}
            title="Word Frequency Analysis"
            description="Count how often each word appears in the text"
          />
          <CheckboxWithDesc
            checked={values.characterCount}
            onChange={(value) => updateField('characterCount', value)}
            title="Character Frequency Analysis"
            description="Count how often each character appears in the text"
          />
          <CheckboxWithDesc
            checked={values.emptyLines}
            onChange={(value) => updateField('emptyLines', value)}
            title="Include Empty Lines"
            description="Include blank lines when counting lines"
          />
        </Box>
      )
    };

    return [delimitersGroup, statisticsGroup];
  };

  const inputComponent = (
    <ToolTextInput title="Input text" value={input} onChange={setInput} />
  );
  const resultComponent = (
    <ToolTextResult title="Text Statistics" value={result} />
  );
  const toolInfo = {
    title: `What is a ${title}?`,
    description: longDescription
  };

  return (
    <ToolContent
      title={title}
      initialValues={initialValues}
      getGroups={getGroups}
      compute={compute}
      input={input}
      setInput={setInput}
      inputComponent={inputComponent}
      resultComponent={resultComponent}
      toolInfo={toolInfo}
      exampleCards={exampleCards}
    />
  );
}

View File

@@ -0,0 +1,15 @@
import { defineTool } from '@tools/defineTool';
import { lazy } from 'react';
// Tool descriptor for the text-statistics tool, built with defineTool and
// exported as `tool`. The first argument ('string') presumably selects the tool
// category — confirm against defineTool.
export const tool = defineTool('string', {
name: 'Text Statistics',
// NOTE(review): the tool list imports this file from './statistic/meta' while
// the path here is 'statistics' — confirm the singular/plural difference is intended.
path: 'statistics',
shortDescription: 'Get statistics about your text',
icon: 'fluent:document-landscape-data-24-filled',
description:
'Load your text in the input form on the left and you will automatically get statistics about your text on the right.',
longDescription:
'This tool provides various statistics about the text you input, including the number of lines, words, and characters. You can also choose to include empty lines in the count. it can count words and characters based on custom delimiters, allowing for flexible text analysis. Additionally, it can provide frequency statistics for words and characters, helping you understand the distribution of terms in your text.',
keywords: ['text', 'statistics', 'count', 'lines', 'words', 'characters'],
// The UI component is loaded on demand through React.lazy.
component: lazy(() => import('./index'))
});

View File

@@ -0,0 +1,108 @@
import { InitialValuesType } from './types';
import { TopItemsList } from '../../list/find-most-popular/service';
/**
 * Counts the lines of `text`, treating '\n' as the line separator.
 *
 * @param text - Text to inspect.
 * @param options - When `options.emptyLines` is true every line is counted;
 *   otherwise lines that are empty or whitespace-only are left out.
 * @returns The number of lines.
 */
function countLines(text: string, options: InitialValuesType): number {
  const lines = text.split('\n');
  if (options.emptyLines) {
    return lines.length;
  }

  let nonBlankLines = 0;
  for (const line of lines) {
    if (line.trim() !== '') {
      nonBlankLines += 1;
    }
  }
  return nonBlankLines;
}
/**
 * Returns the length of `text` as reported by `String.prototype.length`,
 * i.e. in UTF-16 code units (a character outside the BMP counts as two).
 */
function countCharacters(text: string): number {
  const { length } = text;
  return length;
}
/**
 * Counts the sentences in `text`.
 *
 * The text is split on every sentence delimiter and the non-blank pieces are
 * counted. Delimiters come from `options.sentenceDelimiters` (a comma-separated
 * list, each entry trimmed) or default to '.', '!', '?' and '...'.
 *
 * Each delimiter is matched literally: regex metacharacters such as '^', ']'
 * or '\' are escaped, and a multi-character delimiter matches as a whole
 * sequence rather than as a set of individual characters.
 *
 * @param text - Text to inspect.
 * @param options - Supplies the optional custom delimiter list.
 * @returns The number of non-blank segments between delimiters.
 */
function countSentences(text: string, options: InitialValuesType): number {
  const delimiters = (
    options.sentenceDelimiters
      ? options.sentenceDelimiters.split(',').map((s) => s.trim())
      : ['.', '!', '?', '...']
  )
    // An empty alternative would match between every character.
    .filter((delimiter) => delimiter !== '')
    // Longest first so that '...' wins over '.' in the alternation.
    .sort((a, b) => b.length - a.length);

  // Nothing usable to split on: the whole text is at most one sentence.
  if (delimiters.length === 0) {
    return text.trim() !== '' ? 1 : 0;
  }

  const pattern = delimiters
    .map((delimiter) => delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|');
  const regex = new RegExp(pattern, 'g');

  return text.split(regex).filter((sentence) => sentence.trim() !== '').length;
}
/**
 * Counts the words in `text` and, when requested, builds the word-frequency report.
 *
 * Words are the non-blank pieces obtained by splitting the text on a regex
 * character class. The class content is `options.wordDelimiters`, or a default
 * set (whitespace and common punctuation) when that option is empty.
 *
 * The frequency report is only computed when `options.wordCount` is set, so the
 * counting and sorting work is skipped when the result would be discarded.
 *
 * @param text - Text to inspect.
 * @param options - Supplies the custom delimiters and the `wordCount` flag.
 * @returns A tuple of the word count and the frequency report ('' when not requested).
 */
function wordsStats(
  text: string,
  options: InitialValuesType
): [number, string] {
  const defaultDelimiters = `\\s.,;:!?"“”«»()…`;
  // `||` on purpose: an empty string must fall back to the defaults.
  const wordDelimiters = options.wordDelimiters || defaultDelimiters;
  const regex = new RegExp(`[${wordDelimiters}]`, 'gu');
  const words = text.split(regex).filter((word) => word.trim() !== '');

  if (!options.wordCount) {
    return [words.length, ''];
  }

  // The words are passed as an array, so TopItemsList does no further splitting;
  // the flags are deleteEmptyItems=false, ignoreItemCase=true, trimItems=false.
  const wordsFrequency = TopItemsList(
    'regex',
    'count',
    'percentage',
    '',
    words,
    false,
    true,
    false
  );
  return [words.length, wordsFrequency];
}
/**
 * Counts the paragraphs in `text`.
 *
 * Paragraphs are separated by two line breaks ('\n' or '\r\n') with nothing but
 * whitespace between them; blank chunks are not counted.
 */
function countParagraphs(text: string): number {
  const paragraphBreak = /\r?\n\s*\r?\n/;
  let total = 0;
  for (const chunk of text.split(paragraphBreak)) {
    if (chunk.trim() !== '') {
      total += 1;
    }
  }
  return total;
}
/**
 * Builds the character-frequency report for `text`.
 *
 * @param text - Text to analyse.
 * @param options - The report is produced only when `options.characterCount` is set.
 * @returns The report produced by TopItemsList, or '' when the option is off.
 */
function charactersStatistic(text: string, options: InitialValuesType): string {
  if (options.characterCount) {
    // 'symbol' mode with an empty separator hands TopItemsList the raw text to
    // split; flags are deleteEmptyItems=true, ignoreItemCase=true, trimItems=false.
    return TopItemsList(
      'symbol',
      'count',
      'percentage',
      '',
      text,
      true,
      true,
      false
    );
  }
  return '';
}
/**
 * Produces the plain-text statistics report for `input`.
 *
 * The report always starts with the "Text Statistics" summary (characters,
 * words, lines, sentences, paragraphs). A "Words Frequency" section is appended
 * when `options.wordCount` is set and a "Characters Frequency" section when
 * `options.characterCount` is set, each separated from the previous one by a
 * blank line.
 *
 * @param input - Text to analyse; an empty input yields an empty report.
 * @param options - Delimiter settings and section toggles.
 * @returns The formatted report.
 */
export function textStatistics(
  input: string,
  options: InitialValuesType
): string {
  if (!input) return '';

  const lineTotal = countLines(input, options);
  const characterTotal = countCharacters(input);
  const sentenceTotal = countSentences(input, options);
  const [wordTotal, wordsFrequency] = wordsStats(input, options);
  const paragraphTotal = countParagraphs(input);
  const characterStats = charactersStatistic(input, options);

  const rule = '==================';
  const sections: string[] = [
    [
      'Text Statistics',
      rule,
      `Characters: ${characterTotal}`,
      `Words: ${wordTotal}`,
      `Lines: ${lineTotal}`,
      `Sentences: ${sentenceTotal}`,
      `Paragraphs: ${paragraphTotal}`
    ].join('\n')
  ];

  if (options.wordCount) {
    sections.push(['Words Frequency', rule, wordsFrequency].join('\n'));
  }
  if (options.characterCount) {
    sections.push(['Characters Frequency', rule, characterStats].join('\n'));
  }

  return sections.join('\n\n');
}

View File

@@ -0,0 +1,7 @@
// Options of the text-statistics tool, as consumed by the functions in ./service.
export type InitialValuesType = {
// When true, blank lines are included in the line count.
emptyLines: boolean;
// Comma-separated sentence delimiters; empty means the service defaults are used.
sentenceDelimiters: string;
// Content of the regex character class used to split words; empty means the service default.
wordDelimiters: string;
// When true, the report gets a "Characters Frequency" section.
characterCount: boolean;
// When true, the report gets a "Words Frequency" section.
wordCount: boolean;
};

View File

@@ -1,5 +1,24 @@
import { UpdateField } from '@components/options/ToolOptions';
// Shared values for string manipulation start here.
/**
 * Maps the empty string and whitespace/control characters to visible symbols.
 * itemCounter uses it to replace such items with a readable key, so that they
 * remain distinguishable when the counts are displayed.
 **/
export const specialCharMap: { [key: string]: string } = {
'': '␀',
' ': '␣',
'\n': '↲',
'\t': '⇥',
'\r': '␍',
'\f': '␌',
'\v': '␋'
};
// Utility functions for string manipulation start here.
export function capitalizeFirstLetter(string: string | undefined) {
if (!string) return '';
return string.charAt(0).toUpperCase() + string.slice(1);
@@ -63,3 +82,26 @@ export function unquoteIfQuoted(value: string, quoteCharacter: string): string {
}
return value;
}
/**
 * Counts the occurrences of each item.
 *
 * Items that are own keys of `specialCharMap` (empty string, space, newline,
 * tab, ...) are counted under their visible replacement symbol.
 *
 * Counting goes through a Map and own-property checks, so items that share a
 * name with an `Object.prototype` member ("constructor", "toString",
 * "__proto__", ...) are counted like any other item.
 *
 * @param array - Items to count, e.g. the pieces of a user-provided text.
 * @param ignoreItemCase - When true, items are lower-cased before counting.
 * @returns A dictionary of the form `{ [item]: occurrences }`.
 */
export function itemCounter(
  array: string[],
  ignoreItemCase: boolean
): { [key: string]: number } {
  const counts = new Map<string, number>();
  for (const item of array) {
    let key = ignoreItemCase ? item.toLowerCase() : item;
    // Own-property check: `in` would also match inherited members.
    if (Object.prototype.hasOwnProperty.call(specialCharMap, key)) {
      key = specialCharMap[key];
    }
    counts.set(key, (counts.get(key) || 0) + 1);
  }
  // fromEntries defines own data properties, so even "__proto__" is kept as a key.
  return Object.fromEntries(counts);
}