@article {IOPORT.06113546, author = {Burcsi, P\'eter and Cicalese, Ferdinando and Fici, Gabriele and Lipt\'ak, Zsuzsanna}, title = {On approximate jumbled pattern matching in strings.}, year = {2012}, journal = {Theory of Computing Systems}, volume = {50}, number = {1}, issn = {1432-4350}, pages = {35-51}, publisher = {Springer-Verlag, New York, NY}, doi = {10.1007/s00224-011-9344-5}, abstract = {Summary: Given a string $s$, the Parikh vector of $s$, denoted $p(s)$, counts the multiplicity of each character in $s$. Searching for a match of a Parikh vector $q$ in the text $s$ requires finding a substring $t$ of $s$ with $p(t)=q$. This can be viewed as the task of finding a jumbled (permuted) version of a query pattern, hence the term jumbled pattern matching. We present several algorithms for the approximate version of the problem: Given a string $s$ and two Parikh vectors $u,v$ (the query bounds), find all maximal occurrences in $s$ of some Parikh vector $q$ such that $u\leq q\leq v$. This definition encompasses several natural versions of approximate Parikh vector search. We present an algorithm solving this problem in sub-linear expected time using a wavelet tree of $s$, which can be computed in time $O(n)$ in a preprocessing phase. We then discuss a Scrabble-like variation of the problem, in which a weight function on the letters of $s$ is given and one has to find all occurrences in $s$ of a substring $t$ with maximum weight having Parikh vector $p(t)\leq v$. For the case of a binary alphabet, we present an algorithm which solves the decision version of the approximate jumbled pattern matching problem in constant time, by indexing the string in subquadratic time.}, identifier = {06113546}, }